def test_load_repertoire(self):
    """Check load_repertoire() with validation on good and on bad data.

    The good fixture must load without raising; the bad fixture must make
    ``airr.load_repertoire(..., validate=True)`` raise.
    """
    # Good data: any exception while loading is a test failure.
    try:
        airr.load_repertoire(self.rep_good, validate=True)
    except Exception:
        self.fail('load_repertoire(): good data failed')

    # Bad data: resolve the fixture path outside the try block so that a
    # missing fixture attribute surfaces as an error instead of being
    # counted as the expected load failure.
    bad_path = self.rep_bad
    try:
        airr.load_repertoire(bad_path, validate=True)
    except Exception:
        # Expected: invalid data must be rejected.
        pass
    else:
        # Reported from the else clause so the failure itself cannot be
        # swallowed by the except handler above.
        self.fail('load_repertoire(): bad data failed')
def test_load_repertoire(self):
    """Check load_repertoire() with validation on good and on bad data.

    The good fixture must load without raising; the bad fixture must raise
    ``ValidationError``. Any other exception from the bad fixture is
    re-raised after its type is printed.
    """
    # Good data: any exception while loading is a test failure.
    try:
        airr.load_repertoire(self.rep_good, validate=True)
    except Exception:
        self.fail('load_repertoire(): good data failed')

    # Bad data: only a ValidationError counts as the expected outcome.
    try:
        airr.load_repertoire(self.rep_bad, validate=True, debug=True)
    except ValidationError:
        pass
    except Exception as inst:
        # Unexpected error type: show it, then re-raise with the original
        # traceback intact.
        print(type(inst))
        raise
    else:
        # Reported from the else clause so the failure does not pass
        # through the generic handler above.
        self.fail('load_repertoire(): bad data failed')
print("ELAPSED DOWNLOAD TIME (in hours): %s" % (total_time / 3600)) filename = str(query_files.split("/")[-1].split(".")[0]) + "_" + str( study_id) + "__OUT.json" json_data = parse_query( query_json, str(details_dir) + str(query_files.split("/")[-1].split(".")[0]) + "_" + str(study_id) + "_") # # Uncomment when AIRR test is ready to be used again if entry_pt == "repertoire": print("In repertoire entry point", entry_pt) try: airr.load_repertoire(str(details_dir) + filename, validate=True) print("Successful repertoire loading - AIRR test passed\n") except airr.ValidationError as err: print("ERROR: AIRR repertoire validation failed for file %s - %s" % (filename, err)) print("\n") print( "---------------------------------------------------------------------------------------------------------------------------------------------------" ) #Begin sanity checking print( "########################################################################################################" ) print( "---------------------------------------VERIFY FILES ARE HEALTHY-----------------------------------------\n"
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

# One 50-slot histogram (junction lengths 0..49) per T cell subset;
# there are 4 subsets.
subsets = {
    subset_id: [0] * 50
    for subset_id in ('CL_0000895', 'CL_0000900', 'CL_0000897', 'CL_0000909')
}

# Load the repertoire metadata and index it by repertoire_id
data = airr.load_repertoire('repertoires.airr.json')
repertoires = {entry['repertoire_id']: entry for entry in data['Repertoire']}

# Walk the rearrangement data and tabulate junction length counts
reader = airr.read_rearrangement('rearrangements.tsv')
for row in reader:
    # find the repertoire this rearrangement belongs to
    rep = repertoires[row['repertoire_id']]
    # the histogram is selected by the cell_subset of the first sample
    c = subsets[rep['sample'][0]['cell_subset']['id']]
    # rows without a junction length are not counted
    if not row['junction_aa_length']:
        continue
    aa_length = int(row['junction_aa_length'])
    # lengths of 50 or more fall outside the histogram and are ignored
    if aa_length < 50:
        c[aa_length] += 1
cfg['password'] = os.getenv('MONGODB_SECRET') return cfg else: print('ERROR: loading config') return None # connection header config = getConfig() header = 'var conn = new Mongo();\n' header += 'var db = conn.getDB("admin");\n' header += 'db.auth("' + config['service_user'] + '", "' + config[ 'service_secret'] + '");\n' header += 'db = db.getSiblingDB("' + config['db'] + '");\n' os.system("mkdir /work_data/tmp") fname = '/work_data/tmp/repertoire.js' print('Creating file: ' + fname) fout = open(fname, 'w') fout.write(header) # TODO: This should use the AIRR python load_repertoire() data = airr.load_repertoire('/work/florian.airr.yaml') reps = data['Repertoire'] for r in reps: fout.write('db.repertoire.deleteOne({"repertoire_id":"' + r['repertoire_id'] + '"});\n') fout.write('db.repertoire.insertOne(' + json.dumps(r) + ');\n') fout.close()
def airrdownload(args):
    """Download productive rearrangements for each repertoire in an AIRR file.

    Validates and loads the repertoire metadata file named by
    ``args.repertoire``, obtains the repository URL from ``testserver``,
    then pages through the ``/rearrangement`` endpoint for every repertoire
    and writes all returned rows into one AIRR rearrangement TSV file.

    The output path is the input path with its last four characters
    replaced by ``rearrangements.tsv``.

    Args:
        args: Argument namespace; only ``args.repertoire`` (path of the
            repertoire metadata file) is read.

    Raises:
        SystemExit: With status 1 when ``airr.load_repertoire`` raises
            ``TypeError``.
    """
    # Number of rearrangements requested per query; a shorter page marks
    # the end of the data for a repertoire.
    page_size = 1000

    airr.validate_repertoire(args.repertoire, True)
    repertoire_file = args.repertoire
    rearrangements_file = repertoire_file[:-4] + "rearrangements.tsv"
    try:
        data = airr.load_repertoire(repertoire_file)
    except TypeError:
        # Adjacent literals instead of a backslash continuation, so no
        # source indentation can leak into the message.
        sys.stderr.write("TCRcloud error: It seems you did not indicate a "
                         "properly formatted AIRR repertoire file\n")
        sys.exit(1)
    repertoires = data["Repertoire"]
    host_url = testserver(data)

    # Print out some Info
    print(" Info: " + data["Info"]["title"])
    print(" version: " + str(data["Info"]["version"]))
    print("description: " + data["Info"]["description"])
    print("Found " + str(len(repertoires)) +
          " repertoires in repertoire metadata file.")

    # Query the rearrangement endpoint
    # Define a generic query object, and we will replace the repertoire_id
    # within the loop. We also only request productive rearrangements as
    # an additional filter.
    query = {
        "filters": {
            "op": "and",
            "content": [
                {
                    "op": "=",
                    "content": {
                        "field": "repertoire_id",
                        "value": "XXX"
                    }
                },
                {
                    "op": "=",
                    "content": {
                        "field": "productive",
                        "value": True
                    }
                }
            ]
        },
        "size": page_size,
        "from": 0
    }

    # Loop through each repertoire and query rearrangement data for each.
    # We download in chunks of page_size rows using the from and size
    # parameters.
    out_file = None
    try:
        for r in repertoires:
            repertoire_id = r["repertoire_id"]
            print("Retrieving rearrangements for repertoire: " +
                  repertoire_id)
            query["filters"]["content"][0]["content"]["value"] = repertoire_id
            query["size"] = page_size
            query["from"] = 0

            cnt = 0
            while True:
                # send the request
                resp = requests.post(host_url + "/rearrangement", json=query)
                rearrangements = resp.json()["Rearrangement"]

                # The output file is opened lazily, on the first non-empty
                # page, because the full set of fields returned by the
                # repository must be known; otherwise only the required
                # fields would be written.
                if out_file is None and rearrangements:
                    out_file = airr.create_rearrangement(
                        rearrangements_file,
                        fields=rearrangements[0].keys())

                # save the rearrangements to a file
                for row in rearrangements:
                    out_file.write(row)

                # A page shorter than page_size is the last one.
                cnt += len(rearrangements)
                if len(rearrangements) < page_size:
                    break

                # Need to update the from parameter to get the next chunk
                query["from"] = cnt

            print("Retrieved " + str(cnt) +
                  " rearrangements for repertoire: " + repertoire_id)
    finally:
        # Always release the output file, even if a request fails midway.
        if out_file is not None:
            out_file.close()

    if out_file is None:
        print("No rearrangements retrieved; no file was written.")
    else:
        print("Saved as " + rearrangements_file)
def process(self, filename):
    """Load an AIRR repertoire file and insert its repertoires.

    Each entry of the file's ``Repertoire`` list is flattened with
    ``self.ir_flatten`` into a single dictionary, checked for the field
    that links it to its rearrangement file, and finally passed to
    ``self.repositoryInsertRepertoire``.

    Args:
        filename: Path of the AIRR repertoire metadata file.

    Returns:
        True when every repertoire was inserted. False when ``filename``
        is not a file, loading/validation fails, flattening raises
        ``TypeError``, the rearrangement file field is missing from a
        repertoire, or an insert returns None.
    """
    # Check to see if we have a file
    if not os.path.isfile(filename):
        print("ERROR: input file " + filename + " is not a file")
        return False

    # Get the column tag for the iReceptor mapping
    ireceptor_tag = self.getiReceptorTag()
    # Get the column tag for the repository mapping
    repository_tag = self.getRepositoryTag()

    # Check the validity of the repertoires from an AIRR perspective.
    # Validation errors and any other load error are reported the same way.
    try:
        data = airr.load_repertoire(filename, validate=True)
    except Exception as err:
        print("ERROR: AIRR repertoire validation failed for file %s - %s" %
              (filename, err))
        return False

    # Field used to link a repertoire to its rearrangement file.
    rearrangement_file_field = self.getRearrangementFileField()

    # The 'Repertoire' contains a dictionary for each repertoire.
    repertoire_list = []
    for repertoire in data['Repertoire']:
        repertoire_dict = dict()
        for key, value in repertoire.items():
            try:
                self.ir_flatten(key, value, repertoire_dict)
            except TypeError as error:
                print("ERROR: %s" % (error))
                return False

        # Ensure that we have a correct file name to link fields. If we
        # can't find it this is a fatal error, as we can not link any data
        # to this repertoire, so there is no point adding the repertoire.
        repository_file_field = self.getAIRRMap().getMapping(
            rearrangement_file_field, ireceptor_tag, repository_tag)

        # If there is no mapping for this field in the repository mapping,
        # we might still be OK if the metadata itself has the field, so
        # fall back to the unmapped field name.
        if not repository_file_field:
            print(
                "Warning: No repository mapping for the rearrangement file field (%s)"
                % (rearrangement_file_field))
            repository_file_field = rearrangement_file_field

        # Without the file field in the flattened repertoire we cannot link
        # the repertoire to its rearrangements, so abort.
        if repository_file_field not in repertoire_dict:
            print(
                "ERROR: Could not find a rearrangement file field in the metadata (%s)"
                % (rearrangement_file_field))
            print(
                "ERROR: Will not be able to link repertoire to rearrangement annotations"
            )
            return False

        repertoire_list.append(repertoire_dict)

    # Insert only after every repertoire has passed the checks above. Note
    # that all data that was read in is inserted, including any non-MiAIRR
    # fields present in the file.
    # TODO: Ensure that all records are written as the correct type for the repository.
    for r in repertoire_list:
        if self.repositoryInsertRepertoire(r) is None:
            return False

    # If we made it here we are DONE!
    return True