def load_rearrangement(filename, validate=False, debug=False):
    """
    Load the contents of an AIRR rearrangements file into a data frame

    Arguments:
      filename (str): input file path.
      validate (bool): whether to validate data as it is read, raising a ValidationError
                       exception in the event of an error.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      pandas.DataFrame: Rearrangement records as rows of a data frame.
    """
    # TODO: test pandas.DataFrame.read_csv with converters argument as an alternative
    schema = RearrangementSchema
    df = pd.read_csv(filename, sep='\t', header=0, index_col=None,
                     dtype=schema.pandas_types(), true_values=schema.true_values,
                     false_values=schema.false_values)
    # Added to use RearrangementReader without modifying it:
    buffer = StringIO()                       # create an empty buffer
    df.to_csv(buffer, sep='\t', index=False)  # fill buffer
    buffer.seek(0)                            # set to the start of the stream
    reader = RearrangementReader(buffer, validate=validate, debug=debug)
    df = pd.DataFrame(list(reader))
    return df
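
# Example usage (sketch) of the loader above. The path 'rearrangements.tsv' is an
# assumed example file, not part of the original code.
df = load_rearrangement('rearrangements.tsv', validate=True)
print(df.shape)             # (number of rearrangement records, number of columns)
print(df.columns.tolist())  # field names taken from the file header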
def merge_rearrangement(out_filename, in_filenames, drop=False, debug=False):
    """
    Merge one or more AIRR rearrangements files

    Arguments:
      out_filename (str): output file path.
      in_filenames (list): list of input files to merge.
      drop (bool): drop flag. If True then drop fields that do not exist in all input
                   files, otherwise combine fields from all input files.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      bool: True if files were successfully merged, otherwise False.
    """
    try:
        # gather fields from input files
        readers = (RearrangementReader(open(f, 'r'), debug=False) for f in in_filenames)
        field_list = [x.fields for x in readers]
        if drop:
            field_set = set.intersection(*map(set, field_list))
        else:
            field_set = set.union(*map(set, field_list))
        field_order = OrderedDict([(f, None) for f in chain(*field_list)])
        out_fields = [f for f in field_order if f in field_set]

        # write input files to output file sequentially
        readers = (RearrangementReader(open(f, 'r'), debug=debug) for f in in_filenames)
        with open(out_filename, 'w+') as handle:
            writer = RearrangementWriter(handle, fields=out_fields, debug=debug)
            for reader in readers:
                for r in reader:
                    writer.write(r)
                reader.close()
    except Exception as e:
        sys.stderr.write('Error occurred while merging AIRR rearrangement files: %s\n' % e)
        return False

    return True
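
# Example usage (sketch) of merge_rearrangement. The file names below are assumed
# example paths, not part of the original code.
if not merge_rearrangement('merged.tsv', ['sample1.tsv', 'sample2.tsv'], drop=False):
    sys.stderr.write('Merge of AIRR rearrangement files failed\n')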
def read_rearrangement(filename, validate=False, debug=False):
    """
    Open an iterator to read an AIRR rearrangements file

    Arguments:
      filename (str): path to the input file.
      validate (bool): whether to validate data as it is read, raising a ValidationError
                       exception in the event of an error.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      airr.io.RearrangementReader: iterable reader class.
    """
    return RearrangementReader(open(filename, 'r'), validate=validate, debug=debug)
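
# Example usage (sketch) of read_rearrangement: stream records and count the
# productive ones. The path is an assumed example; 'productive' is a standard
# AIRR Rearrangement field, used here as parsed by the reader.
reader = read_rearrangement('rearrangements.tsv')
productive = sum(1 for record in reader if record.get('productive'))
print('Productive records:', productive)
reader.close()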
def validate_rearrangement(filename, debug=False):
    """
    Validates an AIRR rearrangements file

    Arguments:
      filename (str): path of the file to validate.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      bool: True if the file passed validation, otherwise False.
    """
    valid = True
    if debug:
        sys.stderr.write('Validating: %s\n' % filename)

    # Open reader
    handle = open(filename, 'r')
    reader = RearrangementReader(handle, validate=True)

    # Validate header
    try:
        iter(reader)
    except ValidationError as e:
        valid = False
        if debug:
            sys.stderr.write('%s has validation error: %s\n' % (filename, e))

    # Validate each row
    i = 0
    while True:
        try:
            i = i + 1
            next(reader)
        except StopIteration:
            break
        except ValidationError as e:
            valid = False
            if debug:
                sys.stderr.write('%s at record %i has validation error: %s\n' % (filename, i, e))

    # Close
    handle.close()

    return valid
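
# Example usage (sketch) of validate_rearrangement. The path is an assumed example.
if validate_rearrangement('rearrangements.tsv', debug=True):
    print('rearrangements.tsv passed validation')
else:
    print('rearrangements.tsv failed validation')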
def derive_rearrangement(out_filename, in_filename, fields=None, debug=False):
    """
    Create an empty AIRR rearrangements file with fields derived from an existing file

    Arguments:
      out_filename (str): output file path.
      in_filename (str): existing file to derive fields from.
      fields (list): additional non-required fields to add to the output.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      airr.io.RearrangementWriter: open writer class.
    """
    reader = RearrangementReader(open(in_filename, 'r'))
    in_fields = list(reader.fields)
    if fields is not None:
        in_fields.extend([f for f in fields if f not in in_fields])

    return RearrangementWriter(open(out_filename, 'w+'), fields=in_fields, debug=debug)
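
# Example usage (sketch) of derive_rearrangement: create an output file with the same
# fields as an existing file and copy a filtered subset of records into it. The file
# names are assumed examples; closing the writer is assumed to close its file handle.
writer = derive_rearrangement('productive.tsv', 'rearrangements.tsv')
for record in read_rearrangement('rearrangements.tsv'):
    if record.get('productive'):
        writer.write(record)
writer.close()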
def load_rearrangement(filename, validate=False, debug=False):
    """
    Load the contents of an AIRR rearrangements file into a data frame

    Arguments:
      filename (str): input file path.
      validate (bool): whether to validate data as it is read, raising a ValidationError
                       exception in the event of an error.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      pandas.DataFrame: Rearrangement records as rows of a data frame.
    """
    # TODO: test pandas.DataFrame.read_csv with converters argument as an alternative
    # schema = RearrangementSchema
    # df = pd.read_csv(handle, sep='\t', header=0, index_col=None,
    #                  dtype=schema.numpy_types(), true_values=schema.true_values,
    #                  false_values=schema.false_values)
    # return df
    with open(filename, 'r') as handle:
        reader = RearrangementReader(handle, validate=validate, debug=debug)
        df = pd.DataFrame(list(reader))
    return df
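
# Example usage (sketch): with validate=True the reader raises ValidationError on the
# first invalid record, so a caller can trap it. The path is an assumed example and
# ValidationError is the exception class referenced in the docstring above.
try:
    df = load_rearrangement('rearrangements.tsv', validate=True)
except ValidationError as e:
    sys.stderr.write('AIRR validation failed: %s\n' % e)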
def processAIRRTSVFile(self, file_handle, path):
    # Start a timer for performance reasons.
    t_start_full = time.perf_counter()

    # Get the AIRR Map object for this class (for convenience).
    airr_map = self.getAIRRMap()

    # Set the tag for the repository that we are using.
    repository_tag = self.getRepositoryTag()
    # Set the tag for the iReceptor identifier.
    ireceptor_tag = self.getiReceptorTag()

    # Get the fields to use for finding repertoire IDs, either using those IDs
    # directly or by looking for a repertoire ID based on a rearrangement file
    # name.
    repertoire_link_field = self.getRepertoireLinkIDField()
    rearrangement_link_field = self.getRearrangementLinkIDField()
    rearrangement_file_field = self.getRearrangementFileField()

    # Set the tag for the file mapping that we are using. This is essentially the
    # look up into the columns of the AIRR Mapping that we are using. For the IgBLAST
    # parser it is normally the "igblast" column (which is essentially the same as
    # AIRR TSV), but it can be overridden by the user.
    filemap_tag = self.getFileMapping()

    # Set the size of each chunk of data that is inserted.
    chunk_size = self.getRepositoryChunkSize()

    # Validate the AIRR TSV file header. We do not validate the entire
    # file because that is too expensive of an operation.
    # Validate the header by trying to read the first record. If it throws
    # an error then we have a problem.
    airr_reader = RearrangementReader(file_handle, validate=True, debug=True)
    airr_valid = True
    try:
        airr_iterator = iter(airr_reader)
        first_record = next(airr_iterator)
    except ValidationError as e:
        airr_valid = False
        print("ERROR: File %s is not a valid AIRR TSV file, %s" % (path, e))
        return False
    if airr_valid:
        print("Info: File %s has a valid AIRR TSV header" % (path))

    # Get root filename from the path
    filename = os.path.basename(path)

    # Get the single, unique repertoire link id for the filename we are loading. If
    # we can't find one, this is an error and we return failure.
    repertoire_link_id = self.getRepertoireInfo(filename)
    if repertoire_link_id is None:
        print("ERROR: Could not link file %s to a valid repertoire" % (filename))
        return False

    # Extract the fields that are of interest for this file. Essentially all non-null
    # fields in the file. This is a boolean array that is True everywhere there is a
    # notnull field in the column of interest.
    map_column = airr_map.getRearrangementMapColumn(filemap_tag)
    fields_of_interest = map_column.notnull()

    # We select the rows in the mapping that contain fields of interest from the file.
    # At this point, file_fields contains N columns that contain our mappings for the
    # specific formats (e.g. iReceptor, AIRR, VQuest). The rows are limited to
    # only data that is relevant to the file format column of interest.
    file_fields = airr_map.getRearrangementRows(fields_of_interest)

    # We need to build the set of fields that the repository can store. We don't
    # want to extract fields that the repository doesn't want.
    igblastColumns = []
    columnMapping = {}
    if self.verbose():
        print("Info: Dumping expected %s (%s) to repository mapping" %
              (self.getAnnotationTool(), filemap_tag))
    for index, row in file_fields.iterrows():
        if self.verbose():
            print("Info: %s -> %s" % (str(row[filemap_tag]), str(row[repository_tag])))
        # If the repository column has a value for the field in the file, track the
        # field from both the file and repository side.
        if not pd.isnull(row[repository_tag]):
            igblastColumns.append(row[filemap_tag])
            columnMapping[row[filemap_tag]] = row[repository_tag]
        else:
            print("Info: Repository does not support " + str(row[filemap_tag]) +
                  ", not inserting into repository")

    # Get the field names from the file from the airr_reader object.
    # Determine the mapping from the file input to the repository.
    finalMapping = {}
    for airr_field in airr_reader.fields:
        if airr_field in columnMapping:
            if self.verbose():
                print("Info: Mapping %s field in file: %s -> %s" %
                      (self.getAnnotationTool(), airr_field, columnMapping[airr_field]))
            finalMapping[airr_field] = columnMapping[airr_field]
        else:
            if self.verbose():
                print("Info: No mapping for input " + self.getAnnotationTool() +
                      " field " + airr_field +
                      ", adding to repository without mapping.")

    # Determine if we are missing any repository columns from the input data.
    for igblast_column, mongo_column in columnMapping.items():
        if not igblast_column in airr_reader.fields:
            if self.verbose():
                print("Info: Missing data in input " + self.getAnnotationTool() +
                      " file for " + igblast_column)

    # Create a reader for the data frame with step size "chunk_size"
    if self.verbose():
        print("Info: Processing raw data frame...")
    airr_df_reader = pd.read_csv(path, sep='\t', chunksize=chunk_size)

    # Iterate over the file with data frames of size "chunk_size"
    total_records = 0
    for airr_df in airr_df_reader:
        # Remap the column names. We need to remap because the columns may be in a
        # different order in the file than in the column mapping.
        airr_df.rename(finalMapping, axis='columns', inplace=True)

        # Build the substring array that allows indexing for fast searching of
        # Junction AA substrings.
        junction_aa = airr_map.getMapping("junction_aa",
                                          ireceptor_tag, repository_tag)
        ir_substring = airr_map.getMapping("ir_substring",
                                           ireceptor_tag, repository_tag)
        ir_junc_aa_len = airr_map.getMapping("ir_junction_aa_length",
                                             ireceptor_tag, repository_tag)
        if junction_aa in airr_df:
            if self.verbose():
                print("Info: Retrieving junction AA and building substrings",
                      flush=True)
            airr_df[ir_substring] = airr_df[junction_aa].apply(
                Rearrangement.get_substring)

            # The AIRR TSV format doesn't have AA length, we want it in the repository.
            if not (ir_junc_aa_len in airr_df):
                if self.verbose():
                    print("Info: Computing junction amino acids length...",
                          flush=True)
                airr_df[ir_junc_aa_len] = airr_df[junction_aa].apply(
                    Parser.len_null_to_null)

        # We need to look up the "known parameter" from an iReceptor perspective (the
        # field name in the iReceptor column mapping) and map that to the correct
        # field name for the repository we are writing to.
        v_call = airr_map.getMapping("v_call", ireceptor_tag, repository_tag)
        d_call = airr_map.getMapping("d_call", ireceptor_tag, repository_tag)
        j_call = airr_map.getMapping("j_call", ireceptor_tag, repository_tag)
        ir_vgene_gene = airr_map.getMapping("ir_vgene_gene",
                                            ireceptor_tag, repository_tag)
        ir_dgene_gene = airr_map.getMapping("ir_dgene_gene",
                                            ireceptor_tag, repository_tag)
        ir_jgene_gene = airr_map.getMapping("ir_jgene_gene",
                                            ireceptor_tag, repository_tag)
        ir_vgene_family = airr_map.getMapping("ir_vgene_family",
                                              ireceptor_tag, repository_tag)
        ir_dgene_family = airr_map.getMapping("ir_dgene_family",
                                              ireceptor_tag, repository_tag)
        ir_jgene_family = airr_map.getMapping("ir_jgene_family",
                                              ireceptor_tag, repository_tag)

        # Build the v_call field, as an array if there is more than one gene
        # assignment made by the annotator.
        self.processGene(airr_df, v_call, v_call, ir_vgene_gene, ir_vgene_family)
        self.processGene(airr_df, j_call, j_call, ir_jgene_gene, ir_jgene_family)
        self.processGene(airr_df, d_call, d_call, ir_dgene_gene, ir_dgene_family)

        # If we don't already have a locus (that is, the data file didn't provide one)
        # then calculate the locus based on the v_call array.
        locus = airr_map.getMapping("locus", ireceptor_tag, repository_tag)
        if not locus in airr_df:
            airr_df[locus] = airr_df[v_call].apply(Rearrangement.getLocus)

        # Keep track of the repertoire id so we can link each rearrangement to
        # a repertoire.
        rep_rearrangement_link_field = airr_map.getMapping(
            rearrangement_link_field, ireceptor_tag, repository_tag)
        airr_df[rep_rearrangement_link_field] = repertoire_link_id

        # Set the relevant IDs for the record being inserted. If it fails, don't
        # load any data.
        if not self.checkIDFields(airr_df, repertoire_link_id):
            return False

        # Create the created and updated values for this block of records. Note that
        # this means that each block of inserts will have the same date.
        now_str = Rearrangement.getDateTimeNowUTC()
        ir_created_at = airr_map.getMapping("ir_created_at",
                                            ireceptor_tag, repository_tag)
        ir_updated_at = airr_map.getMapping("ir_updated_at",
                                            ireceptor_tag, repository_tag)
        airr_df[ir_created_at] = now_str
        airr_df[ir_updated_at] = now_str

        # Transform the data frame so that it meets the repository type requirements
        if not self.mapToRepositoryType(airr_df):
            print("ERROR: Unable to map data to the repository")
            return False

        # Insert the chunk of records into Mongo.
        num_records = len(airr_df)
        print("Info: Inserting", num_records, "records into Mongo...", flush=True)
        t_start = time.perf_counter()
        records = json.loads(airr_df.T.to_json()).values()
        self.repositoryInsertRearrangements(records)
        t_end = time.perf_counter()
        print("Info: Inserted records, time =", (t_end - t_start), "seconds", flush=True)

        # Keep track of the total number of records processed.
        total_records = total_records + num_records
        print("Info: Total records so far =", total_records, flush=True)

    # Get the number of annotations for this repertoire (as defined by the
    # repertoire link id).
    if self.verbose():
        print("Info: Getting the number of annotations for repertoire %s" %
              (str(repertoire_link_id)))
    annotation_count = self.repositoryCountRearrangements(repertoire_link_id)
    if annotation_count == -1:
        print("ERROR: invalid annotation count (%d), write failed." %
              (annotation_count))
        return False
    if self.verbose():
        print("Info: Annotation count = %d" % (annotation_count), flush=True)

    # Set the cached sequence count field for the repertoire.
    self.repositoryUpdateCount(repertoire_link_id, annotation_count)

    # Inform on what we added and the total count for this record.
    t_end_full = time.perf_counter()
    print("Info: Inserted %d records, annotation count = %d, %f s, %f insertions/s" %
          (total_records, annotation_count, t_end_full - t_start_full,
           total_records / (t_end_full - t_start_full)),
          flush=True)

    return True
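
# Simplified, standalone sketch of the chunked load pattern used by
# processAIRRTSVFile above: read the TSV in blocks, rename columns according to a
# field mapping, then convert each block to per-row dicts for bulk insertion. The
# function name, the 'field_mapping' argument, and the pymongo-style
# 'collection.insert_many' call are assumptions for illustration, not the
# iReceptor implementation.
import json

import pandas as pd


def insert_airr_tsv_in_chunks(path, collection, field_mapping, chunk_size=100000):
    """Load an AIRR TSV in chunks and bulk-insert each chunk into 'collection'."""
    total = 0
    for chunk in pd.read_csv(path, sep='\t', chunksize=chunk_size):
        # Rename file columns to repository field names; unmapped columns keep
        # their original names, matching the rename() call in the method above.
        chunk.rename(field_mapping, axis='columns', inplace=True)
        # Convert the chunk to a list of per-row dicts, mirroring the
        # airr_df.T.to_json() conversion used above.
        records = list(json.loads(chunk.T.to_json()).values())
        collection.insert_many(records)
        total += len(records)
    return total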