Beispiel #1
0
    def concat_intervals(e):
        """Serialise every INSDInterval below ``e`` into a single string.

        Each interval becomes ``"<from>,<to>"`` and the intervals are joined
        with ``';'``. When ``e`` has no INSDInterval descendant, the tuple
        ``(None, None)`` is returned instead of a string.
        """
        interval_nodes = e.xpath(".//INSDInterval")
        if not interval_nodes:
            return None, None

        # Collect every (from, to) pair first, then join them in one go.
        bounds = [
            (text_at_node(node, './/INSDInterval_from'),
             text_at_node(node, './/INSDInterval_to'))
            for node in interval_nodes
        ]
        return ';'.join([','.join(pair) for pair in bounds])
Beispiel #2
0
def get_fasta_list():
    """Fetch the remote directory listing and return the NMDC file names.

    The page at ``base_url`` is requested up to three times, waiting ten
    seconds between attempts. An HTTP error status counts as a failed
    attempt, so an error page is never parsed as if it were the listing.

    Returns:
        The link texts found under ``/html/body/pre/a`` (first entry
        excluded) that start with ``'NMDC'``.

    Raises:
        Exception: whatever the last failed attempt raised, once all
            attempts are exhausted.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            page = requests.get(base_url)
            # A 4xx/5xx answer would otherwise be parsed below and silently
            # produce an empty file list.
            page.raise_for_status()
            break
        except Exception:
            if attempt == max_attempts:
                logger.exception(
                    'Attempt to fetch the list of fasta files continues to fail.'
                )
                raise
            logger.error(
                'Attept to fetch the list of fasta files failed. New attmept in 10 seconds.'
            )
            sleep(10)

    tree = html.fromstring(page.content)
    link_nodes = tree.xpath('/html/body/pre/a')[1:]  # removes '../'
    link_texts = [
        text_at_node(a_node, '.', mandatory=True) for a_node in link_nodes
    ]
    # keep only the sequences of SARS-COV
    return [item for item in link_texts if item.startswith('NMDC')]
Beispiel #3
0
    def do():
        """Ensure the taxonomy XML for ``taxon_name`` exists on disk.

        If ``<containing_directory><sep><taxon_name>.xml`` is missing, the
        taxon id is looked up with ``Entrez.esearch`` and the taxonomy record
        is downloaded with ``Entrez.efetch`` and written to that path.

        Returns:
            The path of the (possibly pre-existing) XML file.
        """
        destination_file_path = f"{containing_directory}{sep}{taxon_name}.xml"
        if exists(destination_file_path):
            # Already downloaded by a previous call.
            return destination_file_path

        # get taxon_id
        with Entrez.esearch(db="taxonomy",
                            term=f'"{taxon_name}"',
                            rettype=None,
                            retmode="xml",
                            tool=entrez_config[0],
                            email=entrez_config[1],
                            api_key=entrez_config[2]) as id_search:
            tree: etree.ElementTree = etree.parse(
                source=id_search,
                parser=etree.XMLParser(remove_blank_text=True))
            # NOTE(review): mandatory=True presumably makes text_at_node fail
            # when the search returned no id - confirm against the helper.
            taxon_id = text_at_node(tree,
                                    '/eSearchResult/IdList/Id',
                                    mandatory=True)

        # download data for taxon_id
        with Entrez.efetch(db="taxonomy",
                           id=taxon_id,
                           rettype=None,
                           retmode="xml",
                           tool=entrez_config[0],
                           email=entrez_config[1],
                           api_key=entrez_config[2]) as handle:
            # Read the complete response BEFORE creating the destination
            # file: if the download fails, no empty file is left behind that
            # a later call would mistake for a valid cached result.
            taxonomy_xml = handle.read()

        with open(destination_file_path, 'w') as f:
            f.write(taxonomy_xml)
        return destination_file_path
Beispiel #4
0
 def __init__(self, xml_tree_file_path: str):
     """Load a taxonomy XML file and keep its first ``/TaxaSet/Taxon`` node.

     If that node has a ``./Rank`` child, the lower-cased rank text is used
     as a key in ``self.suggested_from_other_method`` and mapped to the
     value returned by ``self.taxon_name()``.
     """
     xml_parser = etree.XMLParser(remove_blank_text=True)
     document = etree.parse(xml_tree_file_path, parser=xml_parser)
     self.tax_tree: etree.ElementTree = document.xpath('/TaxaSet/Taxon')[0]

     # xpath returns a list also for single nodes
     rank_nodes = self.tax_tree.xpath('./Rank')
     if not rank_nodes:
         return

     suggested_name = self.taxon_name()
     rank_key = text_at_node(rank_nodes[0], '.').lower()
     self.suggested_from_other_method[rank_key] = suggested_name
Beispiel #5
0
 def equivalent_names(self):
     """Return the equivalent names of the taxon as one ``", "``-joined string.

     The Genbank acronym, when present, comes first, followed by the text of
     every ``.//EquivalentName`` node. Repeated names are kept only at their
     first position.
     """
     acronym = text_at_node(self.tax_tree,
                            './/GenbankAcronym',
                            mandatory=False)
     names = [node.text for node in self.tax_tree.xpath('.//EquivalentName')]
     if acronym:
         names = [acronym] + names
     # An OrderedDict keyed by name drops duplicates while keeping order.
     unique_names = OrderedDict.fromkeys(names)
     return ", ".join(unique_names)
Beispiel #6
0
def _concat_intervals(feature):
    """Serialise the INSDInterval nodes of ``feature`` as 'from,to;from,to'.

    Always returns a string; ``'.'`` (the placeholder this file uses for
    missing columns) is returned when the feature has no interval.
    """
    pairs = [
        (text_at_node(node, './/INSDInterval_from'),
         text_at_node(node, './/INSDInterval_to'))
        for node in feature.xpath(".//INSDInterval")
    ]
    if not pairs:
        return '.'
    return ';'.join(','.join(pair) for pair in pairs)


def _qualifier_value(feature, qualifier_name):
    """Return the value of the named INSDQualifier of ``feature`` (optional lookup)."""
    return text_at_node(
        feature,
        f'.//INSDQualifier[./INSDQualifier_name/text() = "{qualifier_name}"]/INSDQualifier_value',
        False)


def _annotation_from_feature(a_feature, destination_file_path):
    """Build the annotation tuple of one INSDFeature node, or None for 'source' features."""
    # get chromosome
    chromosomes = a_feature.xpath(
        'INSDFeature_intervals/INSDInterval/INSDInterval_accession')
    chromosome_name = text_at_node(chromosomes[0], '.', mandatory=True)
    # warn if more than one chromosome
    for c in chromosomes:
        if text_at_node(c, '.', mandatory=True) != chromosome_name:
            logger.warning(
                f'different chromosome names found while generating {destination_file_path}'
            )
    # interval position
    start_stop_string = _concat_intervals(a_feature)
    # feature type (CDS/ UTR / etc.)
    feature_type = text_at_node(a_feature, './/INSDFeature_key') or '.'
    if feature_type == 'source':
        return None
    feature_type = feature_type.replace('mat_peptide', 'mature_protein_region')
    # gene
    gene_name = (_qualifier_value(a_feature, 'gene') or '.').replace('orf', 'ORF')
    # protein
    product = (_qualifier_value(a_feature, 'product') or '.').replace('orf', 'ORF')
    # AA sequence (one of translation or peptide)
    translation = _qualifier_value(a_feature, 'translation')
    peptide = _qualifier_value(a_feature, 'peptide')
    amino_acid_sequence = translation or peptide or '.'
    # protein ID
    protein_id = _qualifier_value(a_feature, 'protein_id') or '.'

    return (chromosome_name, 'RefSeq', feature_type, start_stop_string,
            gene_name, product, protein_id, amino_acid_sequence)


def _drop_superseded_duplicates(annotations):
    """Keep only the last of the annotations sharing start, stop, gene, product and AA sequence.

    Identical annotations (e.g. of a mature protein region) can appear more
    than once differing only in the protein_id; an annotation is dropped
    when a later one has the same key. The relative order of the kept
    annotations is preserved.
    """
    seen_keys = set()
    kept = []
    # Walking backwards turns "is there a later duplicate?" into a set lookup.
    for annotation in reversed(annotations):
        position = annotation[3]
        key = (
            position.partition(',')[0],   # text before the first comma
            position.rpartition(',')[2],  # text after the last comma
            annotation[4],
            annotation[5],
            annotation[7],
        )
        if key not in seen_keys:
            seen_keys.add(key)
            kept.append(annotation)
    kept.reverse()
    return kept


def _start_coordinate_key(annotation):
    """Sort key: numeric first start coordinate; non-numeric positions go last."""
    first_start = annotation[3].partition(',')[0]
    try:
        return 0, int(first_start)
    except ValueError:
        return 1, 0


def generate_annotation_file(from_reference_sammple_file_path: str,
                             destination_file_path: str):
    """Write a tab-separated annotation file from an INSDSeq XML reference sample.

    Every ``INSDFeature`` of the feature table, except those of type
    ``source``, yields one line with the columns: chromosome accession,
    the literal ``RefSeq``, feature type, intervals (``from,to`` pairs joined
    by ``;``), gene, product, protein id and amino acid sequence. Missing
    optional values are written as ``'.'``.

    Annotations with the same start, stop, gene, product and amino acid
    sequence are collapsed to the last one, and the remaining lines are
    ordered by their first start coordinate. The sort is stable, so lines
    that share a start keep their feature-table order.

    Args:
        from_reference_sammple_file_path: path of the INSDSeq XML file.
        destination_file_path: path of the annotation file to (over)write.
    """
    sample_xml = etree.parse(
        from_reference_sammple_file_path,
        parser=etree.XMLParser(remove_blank_text=True))
    features_nodes = sample_xml.xpath(
        '/INSDSet/INSDSeq/INSDSeq_feature-table/INSDFeature')

    annotations = []
    for a_feature in features_nodes:
        try:
            annotation = _annotation_from_feature(a_feature,
                                                  destination_file_path)
        except AssertionError:
            # NOTE(review): presumably raised by text_at_node(mandatory=True)
            # when a required node is missing - confirm against the helper.
            # Such features are skipped, as before, but no longer silently.
            logger.debug('skipping a feature lacking mandatory data',
                         exc_info=True)
            continue
        if annotation is not None:
            annotations.append(annotation)

    # filter annotations (remove duplicates)
    annotations = _drop_superseded_duplicates(annotations)
    # The original call to sorted() discarded its result, so nothing was
    # sorted; sort in place, comparing coordinates as numbers, not as text.
    annotations.sort(key=_start_coordinate_key)

    with open(destination_file_path, mode='w') as ann_file:
        ann_file.writelines('\t'.join(a) + '\n' for a in annotations)
0
 def species(self):
     """Return the ScientificName of the lineage taxon ranked "species".

     The lookup is optional (``mandatory=False``).
     """
     species_name_path = './/LineageEx/Taxon[./Rank/text() = "species"]/ScientificName'
     return text_at_node(self.tax_tree, species_name_path, mandatory=False)
Beispiel #8
0
 def genus(self):
     """Return the ScientificName of the lineage taxon ranked "genus"."""
     genus_name_path = './/LineageEx/Taxon[./Rank/text() = "genus"]/ScientificName'
     return text_at_node(self.tax_tree, genus_name_path)
Beispiel #9
0
 def sub_family(self):
     """Return the ScientificName of the lineage taxon ranked "subfamily"."""
     subfamily_name_path = './/LineageEx/Taxon[./Rank/text() = "subfamily"]/ScientificName'
     return text_at_node(self.tax_tree, subfamily_name_path)
Beispiel #10
0
 def taxon_name(self):
     """Return the text found at ``./Taxon/ScientificName`` relative to ``self.tax_tree``."""
     scientific_name_path = './Taxon/ScientificName'
     return text_at_node(self.tax_tree, scientific_name_path)
Beispiel #11
0
 def species(self):
     """Return the species name, preferring the lineage over the stored suggestion.

     The ScientificName of the lineage taxon ranked "species" is used when
     the (optional) lookup yields a truthy value; otherwise the 'species'
     entry of ``self.suggested_from_other_method`` is returned, which may be
     None.
     """
     from_lineage = text_at_node(
         self.tax_tree,
         './/LineageEx/Taxon[./Rank/text() = "species"]/ScientificName',
         mandatory=False)
     if from_lineage:
         return from_lineage
     return self.suggested_from_other_method.get('species')
Beispiel #12
0
 def genus(self):
     """Return the genus name, preferring the lineage over the stored suggestion.

     The ScientificName of the lineage taxon ranked "genus" is used when the
     lookup yields a truthy value; otherwise the 'genus' entry of
     ``self.suggested_from_other_method`` is returned, which may be None.
     """
     from_lineage = text_at_node(
         self.tax_tree,
         './/LineageEx/Taxon[./Rank/text() = "genus"]/ScientificName')
     if from_lineage:
         return from_lineage
     return self.suggested_from_other_method.get('genus')
Beispiel #13
0
 def taxon_id(self):
     """Return the text of the ``./TaxId`` child of ``self.tax_tree`` (mandatory lookup)."""
     tax_id_path = './TaxId'
     return text_at_node(self.tax_tree, tax_id_path, mandatory=True)