def extract_recessive_disorder_candidates(input, output, gene_column_name='Gene.refGene',
                                          zygozity_column_name='Otherinfo', delim=','):
    """
    Extract the part of the annotated table that contains candidates for recessive disorders.

    The table in `input` is read, its variant records are grouped by gene, and the
    records of the selected genes are written to `output` below a copy of the header.
    A gene is selected when:
      - it has 2 or more variant records, or
      - it has a single record whose zygosity column contains the text `"hom`
        immediately followed by a tab character.

    A record listing several genes is filed under each of them, so the same line
    can appear in the output more than once.

    Arguments:
      input                - path of the annotated table to read
      output               - path of the table to write
      gene_column_name     - header name of the column holding the gene entry
      zygozity_column_name - header name of the column holding the zygosity
      delim                - column delimiter of both tables

    Raises ValueError if either column name is missing from the header.
    """

    variant_records_per_gene = {}

    with open(input, 'r') as table_in:
        # get right column indexes
        header = quote_aware_split(table_in.readline().strip(), delim)
        gene_col_index = header.index(gene_column_name)
        zygozity_col_index = header.index(zygozity_column_name)

        # iterate the file directly; file.xreadlines() does not exist in Python 3
        for l in table_in:
            # records are written back in per-gene order, so a last line without
            # a terminator would otherwise be fused with the next record
            if not l.endswith('\n'):
                l += '\n'

            lsplit = quote_aware_split(l, delim)
            gene = lsplit[gene_col_index].strip('"')

            # the gene record can be a list (e.g. overlapping genes), so it needs to be split
            genes = [gene]
            if gene.find(',') >= 0:
                genes = parenthesis_aware_split(gene, delim=',')
            genes = [parenthesis_aware_split(gene, delim=';') for gene in genes]

            # unlist, remove the suffix in parenthesis (if present) and only then
            # get unique gene ids, so that a record naming the same gene twice with
            # different suffixes is not counted as two variants of that gene
            gene_ids = set()
            for sublist in genes:
                for gene in sublist:
                    paren_pos = gene.find('(')
                    gene_ids.add(gene[:paren_pos] if paren_pos >= 0 else gene)

            # put the variant record in the map
            for gene in gene_ids:
                variant_records_per_gene.setdefault(gene, []).append(l)

    # write the table with candidates for recessive inheritance model
    with open(output, 'w') as table_out:
        table_out.write(delim.join(header) + '\n')

        # iterate over the genes and select...
        for gene in variant_records_per_gene:
            records = variant_records_per_gene[gene]

            # ...these with 2 or more variants...
            if len(records) >= 2:
                table_out.writelines(records)

            # or homozygous variants
            else:
                # split with the table's delimiter, like every other split here
                lsplit = quote_aware_split(records[0], delim)
                if lsplit[zygozity_col_index].find('"hom\t') >= 0:
                    table_out.write(records[0])
def extract_recessive_disorder_candidates(
    input, output, gene_column_name="Gene.refGene", zygozity_column_name="Otherinfo", delim=","
):
    """
    Extract the part of the annotated table that contains candidates for recessive disorders.

    The table in `input` is read, its variant records are grouped by gene, and the
    records of the selected genes are written to `output` below a copy of the header.
    A gene is selected when:
      - it has 2 or more variant records, or
      - it has a single record whose zygosity column contains the text `"hom`
        immediately followed by a tab character.

    A record listing several genes is filed under each of them, so the same line
    can appear in the output more than once.

    Arguments:
      input                - path of the annotated table to read
      output               - path of the table to write
      gene_column_name     - header name of the column holding the gene entry
      zygozity_column_name - header name of the column holding the zygosity
      delim                - column delimiter of both tables

    Raises ValueError if either column name is missing from the header.
    """

    variant_records_per_gene = {}

    with open(input, "r") as table_in:
        # get right column indexes
        header = quote_aware_split(table_in.readline().strip(), delim)
        gene_col_index = header.index(gene_column_name)
        zygozity_col_index = header.index(zygozity_column_name)

        # iterate the file directly; file.xreadlines() does not exist in Python 3
        for l in table_in:
            # records are written back in per-gene order, so a last line without
            # a terminator would otherwise be fused with the next record
            if not l.endswith("\n"):
                l += "\n"

            lsplit = quote_aware_split(l, delim)
            gene = lsplit[gene_col_index].strip('"')

            # the gene record can be a list (e.g. overlapping genes), so it needs to be split
            genes = [gene]
            if gene.find(",") >= 0:
                genes = parenthesis_aware_split(gene, delim=",")
            genes = [parenthesis_aware_split(gene, delim=";") for gene in genes]

            # unlist, remove the suffix in parenthesis (if present) and only then
            # get unique gene ids, so that a record naming the same gene twice with
            # different suffixes is not counted as two variants of that gene
            gene_ids = set()
            for sublist in genes:
                for gene in sublist:
                    paren_pos = gene.find("(")
                    gene_ids.add(gene[:paren_pos] if paren_pos >= 0 else gene)

            # put the variant record in the map
            for gene in gene_ids:
                variant_records_per_gene.setdefault(gene, []).append(l)

    # write the table with candidates for recessive inheritance model
    with open(output, "w") as table_out:
        table_out.write(delim.join(header) + "\n")

        # iterate over the genes and select...
        for gene in variant_records_per_gene:
            records = variant_records_per_gene[gene]

            # ...these with 2 or more variants...
            if len(records) >= 2:
                table_out.writelines(records)

            # or homozygous variants
            else:
                # split with the table's delimiter, like every other split here
                lsplit = quote_aware_split(records[0], delim)
                if lsplit[zygozity_col_index].find('"hom\t') >= 0:
                    table_out.write(records[0])
def find_geneset_hits_in_samples(sample_files, geneset, gene_column_name='Gene.refGene', delim=',',
                                 has_header=True, fields=None):
    """
    Iterate over the per-sample variant tables (csv, tsv) and look for genes from the geneset.

    The gene column is given in the gene_column_name argument: either by name
    (if has_header==True), or by index of the column (if has_header==False).

    Returns a dict in form
        {sample: {gene1: [variant_line], gene2: [variant_line1, variant_line2]}},
    where sample is the file name as given in sample_files, gene1 and gene2 belong
    to the geneset, and variant_line is the line from the table with every column
    stripped of surrounding whitespace and double quotes and the columns joined
    with tabs. As the same gene can appear twice with distinct variants,
    dict[sample][gene] is a list. Every sample file gets an entry, even when it
    has no hits.

    If fields is given, it is a sequence of indexes selecting which tab-separated
    pieces of the cleaned record are kept (in the given order). Note that the
    indexes refer to the record after it was joined with tabs, so a column that
    itself contains tabs occupies several positions.

    Progress messages are written to stderr.
    """

    hits = {s: {} for s in sample_files}

    for fname in sample_files:
        sys.stderr.write('Processing '+ fname + '...')

        with open(fname) as f:
            if has_header:
                header = quote_aware_split(f.readline().strip(), delim)
                gene_col_index = header.index(gene_column_name)
            else:
                gene_col_index = int(gene_column_name)

            # iterate the file directly; file.xreadlines() does not exist in Python 3
            for l in f:
                lsplit = quote_aware_split(l, delim)
                gene_entry = lsplit[gene_col_index].strip().strip('"')  # clean the gene name

                genes = [gene_entry]
                if gene_entry.find(',') >= 0:
                    genes = parenthesis_aware_split(gene_entry, delim=',')  # split multi gene entries
                genes = [parenthesis_aware_split(gene, delim=';') for gene in genes]

                # strip the transcript change in parenthesis first and only then get
                # unique gene ids, so that one line is never recorded twice for a gene
                gene_ids = set()
                for sublist in genes:
                    for gene in sublist:
                        if gene.find('(') > 0:
                            gene = gene[:gene.find('(')]
                        gene_ids.add(gene)

                variant_record = None  # built lazily, at most once per line
                for gene in gene_ids:
                    if gene not in geneset:
                        continue

                    if variant_record is None:
                        # clean the records
                        variant_record = '\t'.join([e.strip().strip('"') for e in lsplit])
                        # if requested, select a subset of fields
                        if fields is not None:
                            pieces = variant_record.split('\t')
                            variant_record = '\t'.join([pieces[i] for i in fields])

                    hits[fname].setdefault(gene, []).append(variant_record)

        sys.stderr.write('done\n')

    return hits
def include_omim_phenotype_annotation(inputs, output_table, gene_column=7, omim_column=15, delim=','):
    """
    Include OMIM phenotype into the annotation table.

    Reads the table at inputs[1] and writes it to output_table with one extra,
    double-quoted column named 'omim_phenotype'. The phenotype is looked up by
    gene id in omim_gene_phenotype_map (a name this function expects to be defined
    outside of it); 'NA' is written when no gene of the row is found there.

    Arguments:
      inputs       - sequence whose second element is the path of the table to read
      output_table - path of the table to write
      gene_column  - 1-based index of the column holding the gene entry
      omim_column  - 1-based position of the new column; a value <= 0 appends
                     the column after the last header column
      delim        - column delimiter of both tables
    """

    with open(inputs[1], 'r') as table_in, open(output_table, 'w') as table_out:

        # header
        # the line terminator is removed before splitting; otherwise it stays glued
        # to the last column and ends up in the middle of the row whenever the new
        # column is inserted after it
        header_in = quote_aware_split(table_in.readline().rstrip('\r\n'), delim)
        if omim_column <= 0:
            omim_column = len(header_in) + 1
        insert_at = omim_column - 1
        header_out = header_in[:insert_at] + ['omim_phenotype'] + header_in[insert_at:]
        table_out.write(delim.join(header_out) + '\n')

        # the rest of the table
        # iterate the file directly; file.xreadlines() does not exist in Python 3
        for l in table_in:
            lsplit = quote_aware_split(l.rstrip('\r\n'), delim)
            gene = lsplit[gene_column - 1].strip('"')

            # the gene record can be a list (e.g. overlapping genes), so it needs to be split
            genes = [gene]
            if gene.find(',') >= 0:
                genes = parenthesis_aware_split(gene, delim=',')
            genes = [parenthesis_aware_split(gene, delim=';') for gene in genes]

            # unlist, remove the suffix in parenthesis (if present) and get unique gene ids only
            gene_ids = set()
            for sublist in genes:
                for gene in sublist:
                    paren_pos = gene.find('(')
                    gene_ids.add(gene[:paren_pos] if paren_pos >= 0 else gene)

            # NOTE(review): every input row produces exactly one output row. When the
            # row names several genes, the phenotype of the first gene (in sorted
            # order) that is present in the map is used, so the result does not depend
            # on set iteration order - confirm this is the intended policy.
            omim_phenotype = 'NA'
            for gene in sorted(gene_ids):
                try:
                    omim_phenotype = omim_gene_phenotype_map[gene]
                except KeyError:
                    continue
                break

            table_out.write(
                delim.join(lsplit[:insert_at] + ['"' + omim_phenotype + '"'] + lsplit[insert_at:]) + '\n')
def include_omim_phenotype_annotation(inputs, output_table, gene_column=7, omim_column=15, delim=","):
    """
    Include OMIM phenotype into the annotation table.

    Reads the table at inputs[1] and writes it to output_table with one extra,
    double-quoted column named "omim_phenotype". The phenotype is looked up by
    gene id in omim_gene_phenotype_map (a name this function expects to be defined
    outside of it); "NA" is written when no gene of the row is found there.

    Arguments:
      inputs       - sequence whose second element is the path of the table to read
      output_table - path of the table to write
      gene_column  - 1-based index of the column holding the gene entry
      omim_column  - 1-based position of the new column; a value <= 0 appends
                     the column after the last header column
      delim        - column delimiter of both tables
    """

    with open(inputs[1], "r") as table_in, open(output_table, "w") as table_out:

        # header
        # the line terminator is removed before splitting; otherwise it stays glued
        # to the last column and ends up in the middle of the row whenever the new
        # column is inserted after it
        header_in = quote_aware_split(table_in.readline().rstrip("\r\n"), delim)
        if omim_column <= 0:
            omim_column = len(header_in) + 1
        insert_at = omim_column - 1
        header_out = header_in[:insert_at] + ["omim_phenotype"] + header_in[insert_at:]
        table_out.write(delim.join(header_out) + "\n")

        # the rest of the table
        # iterate the file directly; file.xreadlines() does not exist in Python 3
        for l in table_in:
            lsplit = quote_aware_split(l.rstrip("\r\n"), delim)
            gene = lsplit[gene_column - 1].strip('"')

            # the gene record can be a list (e.g. overlapping genes), so it needs to be split
            genes = [gene]
            if gene.find(",") >= 0:
                genes = parenthesis_aware_split(gene, delim=",")
            genes = [parenthesis_aware_split(gene, delim=";") for gene in genes]

            # unlist, remove the suffix in parenthesis (if present) and get unique gene ids only
            gene_ids = set()
            for sublist in genes:
                for gene in sublist:
                    paren_pos = gene.find("(")
                    gene_ids.add(gene[:paren_pos] if paren_pos >= 0 else gene)

            # NOTE(review): every input row produces exactly one output row. When the
            # row names several genes, the phenotype of the first gene (in sorted
            # order) that is present in the map is used, so the result does not depend
            # on set iteration order - confirm this is the intended policy.
            omim_phenotype = "NA"
            for gene in sorted(gene_ids):
                try:
                    omim_phenotype = omim_gene_phenotype_map[gene]
                except KeyError:
                    continue
                break

            table_out.write(
                delim.join(lsplit[:insert_at] + ['"' + omim_phenotype + '"'] + lsplit[insert_at:]) + "\n"
            )