def concat_intervals(e):
    """Serialise every ``INSDInterval`` found below *e* into one string.

    Each interval contributes ``"<from>,<to>"``; the pairs are joined with
    ``;``. The values come from ``text_at_node`` on ``.//INSDInterval_from``
    and ``.//INSDInterval_to``.

    NOTE: when *e* has no ``INSDInterval`` descendant the function returns
    the tuple ``(None, None)`` rather than a string, so the return type
    depends on the input.
    """
    interval_nodes = e.xpath(".//INSDInterval")
    # "from" is read before "to" for every node, in document order.
    pairs = [
        (text_at_node(node, './/INSDInterval_from'),
         text_at_node(node, './/INSDInterval_to'))
        for node in interval_nodes
    ]
    if not interval_nodes:
        return None, None
    return ';'.join(','.join(pair) for pair in pairs)
def get_fasta_list():
    """Fetch the directory listing at ``base_url`` and return the file names.

    The HTTP request is tried up to three times, sleeping 10 seconds after
    each of the first two failures. The exception raised by the third failed
    attempt is logged and re-raised.

    Returns the text of every ``/html/body/pre/a`` link except the first one
    (the ``../`` parent link), restricted to names starting with ``NMDC``.
    """
    max_attempts = 3
    page = None
    for attempts_left in range(max_attempts, 0, -1):
        try:
            page = requests.get(base_url)
        except Exception as err:
            if attempts_left == 1:
                # Last attempt: give up and let the caller see the failure.
                logger.exception(
                    'Attempt to fetch the list of fasta files continues to fail.'
                )
                raise err
            logger.error(
                'Attept to fetch the list of fasta files failed. New attmept in 10 seconds.'
            )
            sleep(10)
        else:
            break

    anchors = html.fromstring(page.content).xpath('/html/body/pre/a')
    # The first anchor is the '../' link to the parent directory.
    names = [
        text_at_node(anchor, '.', mandatory=True) for anchor in anchors[1:]
    ]
    # Per the original author's note, the 'NMDC' prefix selects the
    # SARS-CoV sequences.
    return [name for name in names if name.startswith('NMDC')]
def do():
    """Ensure the taxonomy XML for ``taxon_name`` exists on disk; return its path.

    The target is ``<containing_directory><sep><taxon_name>.xml``. If that
    file already exists nothing is downloaded. Otherwise the taxon id is
    looked up with ``Entrez.esearch`` and the record fetched with
    ``Entrez.efetch`` is written to the target file.
    """
    destination_file_path = f"{containing_directory}{sep}{taxon_name}.xml"
    if exists(destination_file_path):
        return destination_file_path

    # Step 1: resolve the taxon name to its taxonomy id.
    search_handle = Entrez.esearch(db="taxonomy",
                                   term=f'"{taxon_name}"',
                                   rettype=None,
                                   retmode="xml",
                                   tool=entrez_config[0],
                                   email=entrez_config[1],
                                   api_key=entrez_config[2])
    with search_handle as id_search:
        tree: etree.ElementTree = etree.parse(
            source=id_search,
            parser=etree.XMLParser(remove_blank_text=True))
        taxon_id = text_at_node(tree,
                                '/eSearchResult/IdList/Id',
                                mandatory=True)

    # Step 2: download the taxonomy record and store it verbatim.
    fetch_handle = Entrez.efetch(db="taxonomy",
                                 id=taxon_id,
                                 rettype=None,
                                 retmode="xml",
                                 tool=entrez_config[0],
                                 email=entrez_config[1],
                                 api_key=entrez_config[2])
    with fetch_handle as handle, open(destination_file_path, 'w') as f:
        f.write(handle.read())
    return destination_file_path
def __init__(self, xml_tree_file_path: str):
    """Load the first ``/TaxaSet/Taxon`` element of the given XML file.

    If that element has a direct ``Rank`` child, the lower-cased rank is
    recorded in ``self.suggested_from_other_method`` with the value of
    ``self.taxon_name()``.

    :param xml_tree_file_path: path of the taxonomy XML file to parse.
    """
    xml_parser = etree.XMLParser(remove_blank_text=True)
    document = etree.parse(xml_tree_file_path, parser=xml_parser)
    self.tax_tree: etree.ElementTree = document.xpath('/TaxaSet/Taxon')[0]

    # xpath always yields a list, even when a single node matches.
    rank_nodes = self.tax_tree.xpath('./Rank')
    if rank_nodes:
        name = self.taxon_name()
        suggestions = self.suggested_from_other_method
        rank_name = text_at_node(rank_nodes[0], '.').lower()
        suggestions[rank_name] = name
def equivalent_names(self):
    """Return the taxon's alternative names as one ``", "``-separated string.

    The names are the text of every ``EquivalentName`` descendant of
    ``self.tax_tree``. If a ``GenbankAcronym`` is present, it is placed
    first. Duplicates are removed, keeping the first occurrence.
    """
    acronym = text_at_node(self.tax_tree,
                           './/GenbankAcronym',
                           mandatory=False)
    names = [node.text for node in self.tax_tree.xpath('.//EquivalentName')]
    if acronym:
        names = [acronym] + names
    # An OrderedDict's keys act as an insertion-ordered set.
    unique_names = OrderedDict.fromkeys(names)
    return ", ".join(unique_names)
def generate_annotation_file(from_reference_sammple_file_path: str,
                             destination_file_path: str):
    """Extract the feature annotations of an INSDSeq XML file into a TSV file.

    Every ``INSDFeature`` under ``/INSDSet/INSDSeq/INSDSeq_feature-table``
    whose key is not ``source`` becomes one tab-separated row with the
    columns: chromosome accession, the literal ``RefSeq``, feature type,
    interval string (``start,stop`` pairs joined by ``;``), gene, product,
    protein id and amino-acid sequence. Optional values that are missing are
    written as ``.``.

    An annotation is dropped when a later annotation has the same first
    start, last stop, gene, product and amino-acid sequence (such duplicates
    differ only in their protein id). Rows are written in the order in which
    the features appear in the input.

    Features for which an ``AssertionError`` is raised while reading their
    fields are skipped (presumably raised by ``text_at_node`` for a missing
    mandatory node - confirm against its definition).

    :param from_reference_sammple_file_path: path of the INSDSeq XML file.
    :param destination_file_path: path of the TSV file to (over)write.
    """

    def concat_intervals(e):
        """Return the intervals below *e* as ``"from,to;from,to;..."``."""
        intervals = e.xpath(".//INSDInterval")
        if not intervals:
            # Kept as in the original: a tuple, not a string, when empty.
            return None, None
        pairs = [(text_at_node(i, './/INSDInterval_from'),
                  text_at_node(i, './/INSDInterval_to')) for i in intervals]
        return ';'.join(','.join(pair) for pair in pairs)

    def qualifier_value(feature, qualifier_name):
        """Return the value of the named INSDQualifier, looked up as optional."""
        return text_at_node(
            feature,
            f'.//INSDQualifier[./INSDQualifier_name/text() = "{qualifier_name}"]'
            '/INSDQualifier_value',
            False)

    def interval_bounds(start_stop_string):
        """Return (first start, last stop); ValueError if there is no ','."""
        first_start = start_stop_string[:start_stop_string.index(',')]
        last_stop = start_stop_string[start_stop_string.rindex(',') + 1:]
        return first_start, last_stop

    sample_xml: ElementTree = etree.parse(
        from_reference_sammple_file_path,
        parser=etree.XMLParser(remove_blank_text=True))
    features_nodes = sample_xml.xpath(
        '/INSDSet/INSDSeq/INSDSeq_feature-table/INSDFeature')

    annotations = []
    for a_feature in features_nodes:
        try:
            # get chromosome
            chromosomes = a_feature.xpath(
                'INSDFeature_intervals/INSDInterval/INSDInterval_accession')
            chromosome_name = text_at_node(chromosomes[0], '.', mandatory=True)
            # warn if more than one chromosome
            for c in chromosomes:
                if text_at_node(c, '.', mandatory=True) != chromosome_name:
                    logger.warning(
                        f'different chromosome names found while generating {destination_file_path}'
                    )
            # interval position
            start_stop_string = concat_intervals(a_feature)
            # feature type (CDS / UTR / etc.)
            feature_type = text_at_node(a_feature, './/INSDFeature_key') or '.'
            if feature_type == 'source':
                continue
            feature_type = feature_type.replace('mat_peptide',
                                                'mature_protein_region')
            # gene
            gene_name = qualifier_value(a_feature, 'gene') or '.'
            gene_name = gene_name.replace('orf', 'ORF')
            # protein
            product = qualifier_value(a_feature, 'product') or '.'
            product = product.replace('orf', 'ORF')
            # AA sequence (one of translation or peptide)
            translation = qualifier_value(a_feature, 'translation')
            peptide = qualifier_value(a_feature, 'peptide')
            amino_acid_sequence = translation or peptide or '.'
            # protein ID
            protein_id = qualifier_value(a_feature, 'protein_id') or '.'

            annotations.append(
                (chromosome_name, 'RefSeq', feature_type, start_stop_string,
                 gene_name, product, protein_id, amino_acid_sequence))
        except AssertionError as e:
            # Best effort: a feature that fails an assertion is left out, but
            # the skip is now traceable instead of being silently swallowed.
            logger.debug(f'skipping feature while generating {destination_file_path}: {e}')

    # filter annotations (remove duplicates)
    annotations_copy = []
    removed = []
    # Bound up front so the IndexError handler below can always format them.
    i = j = None
    try:
        for i, a in enumerate(annotations):
            do_not_add = False
            a_start, a_stop = interval_bounds(a[3])
            # Look for a later annotation with the same coordinates and gene;
            # if it also has the same product and AA sequence, `a` is a
            # duplicate (identical except for the protein_id) and is dropped.
            for j in range(i + 1, len(annotations)):
                a2 = annotations[j]
                a2_start, a2_stop = interval_bounds(a2[3])
                same_locus = (a_start == a2_start and a_stop == a2_stop
                              and a[4] == a2[4])
                if same_locus and a[5] == a2[5] and a[7] == a2[7]:
                    do_not_add = True
                    removed.append(a)
            if not do_not_add:
                annotations_copy.append(a)
    except ValueError:
        # An interval string without ',' aborts the filtering; dump the state
        # for debugging and carry on with what was collected so far.
        print('ANNOTATIONS')
        for a in annotations:
            print(*a, sep='\t', end='\n')
        print('\n\n')
        print('ANNOTATIONS COPY')
        for a in annotations_copy:
            print(*a, sep='\t', end='\n')
        print('\n\n')
        print('TO REMOVE')
        print(*removed)
    except IndexError:
        logger.exception(
            f"len annotations: {len(annotations)}, i: {i}, j: {j}")

    # NOTE: the original called sorted(annotations_copy, key=...) here and
    # discarded the result, so the rows were never reordered. The no-op is
    # removed and the rows keep the order of the input features.
    with open(destination_file_path, mode='w') as ann_file:
        for a in annotations_copy:
            ann_file.write('\t'.join(a) + '\n')
def species(self):
    """Return the scientific name of the lineage entry ranked ``species``.

    The lookup is made with ``mandatory=False``.
    """
    species_name_path = (
        './/LineageEx/Taxon[./Rank/text() = "species"]/ScientificName')
    return text_at_node(self.tax_tree, species_name_path, mandatory=False)
def genus(self):
    """Return the scientific name of the lineage entry ranked ``genus``."""
    genus_name_path = (
        './/LineageEx/Taxon[./Rank/text() = "genus"]/ScientificName')
    return text_at_node(self.tax_tree, genus_name_path)
def sub_family(self):
    """Return the scientific name of the lineage entry ranked ``subfamily``."""
    subfamily_name_path = (
        './/LineageEx/Taxon[./Rank/text() = "subfamily"]/ScientificName')
    return text_at_node(self.tax_tree, subfamily_name_path)
def taxon_name(self):
    """Return the text found at ``./Taxon/ScientificName`` under ``self.tax_tree``."""
    scientific_name_path = './Taxon/ScientificName'
    return text_at_node(self.tax_tree, scientific_name_path)
def species(self):
    """Return the species name, preferring the lineage over the fallback.

    The lineage entry ranked ``species`` is read with ``mandatory=False``.
    If that yields a falsy value, the ``'species'`` entry of
    ``self.suggested_from_other_method`` is returned instead (``None`` if
    absent).
    """
    species_name_path = (
        './/LineageEx/Taxon[./Rank/text() = "species"]/ScientificName')
    from_lineage = text_at_node(self.tax_tree,
                                species_name_path,
                                mandatory=False)
    if from_lineage:
        return from_lineage
    return self.suggested_from_other_method.get('species')
def genus(self):
    """Return the genus name, preferring the lineage over the fallback.

    If the lineage entry ranked ``genus`` yields a falsy value, the
    ``'genus'`` entry of ``self.suggested_from_other_method`` is returned
    instead (``None`` if absent).
    """
    genus_name_path = (
        './/LineageEx/Taxon[./Rank/text() = "genus"]/ScientificName')
    from_lineage = text_at_node(self.tax_tree, genus_name_path)
    if from_lineage:
        return from_lineage
    return self.suggested_from_other_method.get('genus')
def taxon_id(self):
    """Return the text of the ``TaxId`` child of ``self.tax_tree`` (mandatory lookup)."""
    tax_id_path = './TaxId'
    return text_at_node(self.tax_tree, tax_id_path, mandatory=True)