def read_taxonomy(tax_f):
    root = TNode('*', [])
    tax = Taxonomy(tax_f, root)

    with open(tax_f) as f:
        for line in f:
            node_name, ph_str = line.strip('\r\n').split('\t')
            node = TNode(node_name, ph_str.split(','))
            tax.add_node(node)

    return tax
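
A minimal usage sketch, assuming each input line is a node name and a comma-separated phrase list separated by a single tab, as the parsing above implies; the file name and contents are illustrative.

# Hypothetical taxonomy file 'animals.tsv', one node per line:
#   mammal<TAB>dog,cat,horse
#   bird<TAB>sparrow,eagle
tax = read_taxonomy('animals.tsv')  # Taxonomy rooted at the '*' TNode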
Example #2
def show_lines_not_found_in_taxonomy(double_translated, taxonomy: Taxonomy):
    # ['Group', 'CommonName', 'Rare', 'Total', 'TaxonOrder']

    for local_name, _ in double_translated:
        row = taxonomy.find_local_name_row(local_name)
        if row is None:
            print(f'Not found in taxonomy: {local_name}')
def main():
    import argparse
    usage   = "%(prog)s -v" #usage=usage, 
    parser  = argparse.ArgumentParser(description=desc, epilog=epilog, \
                                      formatter_class=argparse.RawTextHelpFormatter)
  
    parser.add_argument('--version', action='version', version='1.0b')   
    parser.add_argument("-v", "--verbose", default=False, action="store_true",
                        help="verbose")
    parser.add_argument('-t', '--taxids', nargs="+", type=int,
                        help="group taxid(s)    [%(default)s]")
    parser.add_argument("--taxadb",        default="/users/tg/lpryszcz/cluster/rapsi/taxonomy.db3",
                        help="taxonomy path  [%(default)s]")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n"%str(o))
        
    #init taxonomy
    taxa = Taxonomy(o.taxadb)

    #init metaphors connection
    cur = _getConnection()
    cur.execute("select taxid, name from species")
    species = {}
    for taxid, name in cur.fetchall():
        species[taxid] = (taxid, name)
    
    if o.verbose:
        sys.stderr.write("%s species in database\n"%len(species))
        
    #process taxa groups
    for taxid in o.taxids:
        #fetch proteins from given taxa
        taxid2proteomes(cur, species, taxa, taxid, o.verbose)
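
A self-contained sketch of how the -t/--taxids option above parses multiple taxids; the values are illustrative.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-t', '--taxids', nargs='+', type=int)
parser.add_argument('-v', '--verbose', default=False, action='store_true')
o = parser.parse_args(['-v', '-t', '4751', '33208'])
# o.taxids == [4751, 33208], o.verbose is True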
Example #4
def main():
    import argparse
    usage   = "%(prog)s -v" #usage=usage, 
    parser  = argparse.ArgumentParser(description=desc, epilog=epilog, \
                                      formatter_class=argparse.RawTextHelpFormatter)
  
    parser.add_argument('--version', action='version', version='1.0b')   
    parser.add_argument("-v", "--verbose", default=False, action="store_true",
                        help="verbose")
    parser.add_argument('-d', '--db', default="metaphors_201405",
                        help="database name     [%(default)s]")
    parser.add_argument('-t', '--taxids', nargs="+", type=int,
                        help="group taxid(s)    [%(default)s]")
    parser.add_argument("--taxadb",        default="/users/tg/lpryszcz/cluster/rapsi/taxonomy.db3",
                        help="taxonomy path  [%(default)s]")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n"%str(o))
        
    #init taxonomy
    taxa = Taxonomy(o.taxadb)

    #init metaphors connection
    m  = dbClient.metaphors(o.db)
    if o.verbose:
        sys.stderr.write("%s species in %s database\n"%(len(m.species), o.db))
        
    #process taxa groups
    for taxid in o.taxids:
        #fetch proteins from given taxa
        taxid2proteomes(m, taxa, taxid, o.verbose)
def create_row_for_missing_species(
        common_name: str, summary: pd.DataFrame,
        taxonomy: Taxonomy) -> Optional[Tuple[pd.Series, bool]]:
    # can also be a SPUH, ISSF, etc., i.e. something that wasn't on the official list
    # The number of columns may vary based on the checklist, but we fill
    # in the ones that we know must be there
    taxonomy_row = taxonomy.find_local_name_row(common_name)
    if taxonomy_row is None:  # i.e. not found, drop it
        return None

    new_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    new_row['Group'] = taxonomy_row.SPECIES_GROUP
    new_row['CommonName'] = common_name
    new_row['TaxonOrder'] = taxonomy_row.TAXON_ORDER
    new_row['NACC_SORT_ORDER'] = taxonomy_row.NACC_SORT_ORDER
    new_row['ABA_SORT_ORDER'] = taxonomy_row.ABA_SORT_ORDER
    new_row['Category'] = taxonomy_row.Category
    # Filled in later. This is the "Grand Total", not the total from an individual checklist
    new_row['Total'] = 0

    # Not on official list, so mark it Rare if it's a species (not SPUH etc.)
    rarity = taxonomy_row.Category == 'species'
    if rarity:
        new_row['Rare'] = 'X'

    return new_row, rarity
def create_category_column(summary: pd.DataFrame, taxonomy: Taxonomy) -> list:
    categories = []
    for common_name in summary.CommonName.values:
        taxonomy_row = taxonomy.find_local_name_row(common_name)
        category = '' if taxonomy_row is None else taxonomy_row.Category
        categories.append(category)

    return categories
def filter_additional_rare(taxonomy: Taxonomy,
                           additional_rare: List[str]) -> List[str]:
    rare_species = []
    for cn in additional_rare:
        row = taxonomy.find_local_name_row(cn)
        if row is not None and row.Category == 'species':
            rare_species.append(cn)

    return rare_species
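
A usage sketch, assuming a loaded Taxonomy whose find_local_name_row returns rows carrying a Category attribute; the names are illustrative.

# Hypothetical: keep only names the taxonomy classifies as full species
additional_rare = ['Tundra Swan', 'Empidonax sp.', 'Not A Real Bird']
rare_species = filter_additional_rare(taxonomy, additional_rare)
# spuhs ('Empidonax sp.') and unrecognized names are filtered out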
Example #8
    def load_from_json(cls, name):
        """
        Loads a taxonomy object from the given json file and name.
        """

        name_with_extension = str(name) + ".json"
        file_name = "taxonomies/" + name_with_extension

        with open(file_name) as file_object:
            graph_json = json.load(file_object)

        graph = json_graph.node_link_graph(graph_json)
        return Taxonomy(graph)
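
A usage sketch; the cls parameter suggests this is a @classmethod on Taxonomy (the decorator is not shown above), and the 'taxonomies/' directory and file name below are assumptions.

# Hypothetical: 'animals' resolves to taxonomies/animals.json (node-link JSON)
taxonomy = Taxonomy.load_from_json('animals')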
Example #9
def clean_common_names(
        common_names: List[str], taxonomy: Taxonomy,
        local_translation_context: LocalTranslationContext) -> List[str]:
    # skip tertiary_transformation() for now
    common_names = [
        secondary_species_processing(pre_process_line(line))
        for line in common_names
    ]

    #  text_list = [tertiary_transformation(secondary_species_processing(pre_process_line(line))) \
    #                for line in text_list]

    # # Processing 1 checklist here
    # tti = TaxonomyTokenIdentify(taxonomy, interim_data_path)
    #
    # # use text_list from above
    # text_list_lower = [x.lower() for x in text_list]
    # possibles = filter_to_possibles(tti, text_list_lower)
    # print(f'Possible species lines: {len(possibles)} (based on word intersections)')

    # Double translate
    # print('Doing double translation')  # Can take a while
    translated = []
    for line in common_names:  # was: possibles
        txline = local_translation_context.apply_translations(
            line.lower(), True)
        translated.append(txline)

    double_translated = []
    for line, _ in translated:
        txline2 = local_translation_context.apply_translations(
            line.lower(), True)
        double_translated.append(txline2)

    double_translated = [x for (x, y) in double_translated]
    # print(double_translated)

    # they may be all lower case, return proper capitalization
    result = []
    for common_name in double_translated:
        xcn = ''
        if common_name != '':  # avoid most common exception
            try:
                row = taxonomy.find_local_name_row(common_name)
                xcn = row.comName
            except AttributeError as ae:
                print(ae)
                print(f'no taxonomy entry for "{common_name}"')
        result.append(xcn)

    return result
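
A usage sketch, assuming a Taxonomy and LocalTranslationContext set up as in the other examples; the raw names are illustrative.

# Hypothetical: normalize hand-typed names to the taxonomy's common names
raw_names = ['california quail', 'house finch']
cleaned = clean_common_names(raw_names, taxonomy, local_translation_context)
# names the taxonomy cannot resolve come back as '' (see the AttributeError handler above)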
Example #10
    def read(self, parsed_taxonomies):
        """
        Read in taxonomies for a given code table.

        params:
            taxonomies (dict{id: Taxonomy})
        """
        for key, taxonomy in parsed_taxonomies.iteritems():
            synonym_phrases = [
                Phrase(synonym) for synonym in taxonomy.synonyms
            ]

            head_phrase = Phrase(taxonomy.head)
            self.taxonomies[key] = Taxonomy(key, head_phrase, synonym_phrases)
Example #11
    def taxonomies_to_str(self):
        """
        Convert taxonomies to strings, to be written to a file.

        returns:
            taxonomies_as_str {dict:Taxonomy(str)}
        """
        taxonomies_as_str = {}
        for key, taxonomy in self.taxonomies.iteritems():
            taxonomy_head = taxonomy.head.raw_form
            synonyms = [synonym.raw_form for synonym in taxonomy.synonyms]

            taxonomies_as_str[key] = Taxonomy(key, taxonomy_head, synonyms)

        return taxonomies_as_str
Example #12
def strip_off_scientific_names(text_list: List[str],
                               taxonomy: Taxonomy) -> List[str]:
    # The CAMP-2020 checklist has <Common Name> <Scientific Name>
    # Assume all scientific names are two words and drop
    stripped_text_list = []
    for line in text_list:
        line = line.strip()
        # e.g. line = 'California Quail Callipepla californica'
        words = line.split(' ')
        if len(words) > 2:
            sci_name = ' '.join(words[-2:]).lower()
            row = taxonomy.find_scientific_name_row(sci_name)
            if row is not None:
                line = ' '.join(words[:-2])  #.lower()
        stripped_text_list.append(line)

    return stripped_text_list
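
A worked sketch of the behavior above, assuming the taxonomy recognizes the trailing binomial; the input lines are illustrative.

lines = ['California Quail Callipepla californica', 'House Finch']
stripped = strip_off_scientific_names(lines, taxonomy)
# -> ['California Quail', 'House Finch']; the last two words are dropped only
#    when taxonomy.find_scientific_name_row() matches them as a scientific name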
Example #13
def find_cites(contribs):
    tax = Taxonomy()
    successes = 0
    failures = 0
    citations = []
    for contrib in contribs:
        row = None
        if "Rationale" in contrib.keys():
            row = contrib["Rationale"]
        if "Text" in contrib.keys():
            row = contrib["Text"]
        if row is not None:
            cites = search_for_citations(tax, row)
            entry = {"ID": contrib["ID"], "Citations": cites}
            citations.append(entry)
            successes += 1
        else:
            failures += 1
    return citations
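
A usage sketch; search_for_citations is not shown here, so its return value is left opaque, and the contribution records are illustrative.

# Hypothetical contributions; only records with a 'Rationale' or 'Text' field are searched
contribs = [
    {'ID': 1, 'Rationale': 'Placement follows Smith (2005).'},
    {'ID': 2, 'Notes': 'no searchable text field here'},
]
citations = find_cites(contribs)
# -> [{'ID': 1, 'Citations': <result of search_for_citations(tax, ...)>}]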
Example #14
def csv_dataframe_to_checklist(
        checklist: pd.DataFrame, taxonomy: Taxonomy,
        local_translation_context: LocalTranslationContext, observer_name: str,
        xdates: List[str]) -> Optional[pd.DataFrame]:
    # Use column names from eBird and let them be fixed by transform_checklist_details
    # These all have to be present for transform_checklist_details

    if set(checklist.columns) & {'CommonName', 'Total'} == set():
        return None

    cleaned_common_names = clean_common_names(checklist.CommonName, taxonomy,
                                              local_translation_context)
    checklist.CommonName = cleaned_common_names

    # This will get switched back by transform_checklist_details
    checklist.rename(columns={'Total': 'howManyStr'}, inplace=True)
    xdtypes = {'CommonName': str, 'howManyStr': int}
    checklist = checklist.astype(dtype=xdtypes)

    checklist['speciesCode'] = [
        taxonomy.find_species6_ebird(cn) for cn in checklist.CommonName
    ]
    checklist['locId'] = 'L5551212'
    checklist['subId'] = 'S5551212'
    checklist['groupId'] = ''
    checklist['durationHrs'] = 0.5
    checklist['effortDistanceKm'] = 0.1
    checklist['effortDistanceEnteredUnit'] = 'mi'
    # 'obsDt' needs dates in this form '26 Dec 2020'
    obsdt = normalize_date_for_visits(xdates[0])
    checklist['obsDt'] = f'{obsdt} 12:01'

    checklist['userDisplayName'] = observer_name
    checklist['numObservers'] = 1
    checklist['comments'] = 'Generated'

    # Clean up
    checklist = transform_checklist_details(checklist, taxonomy)

    return checklist
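
A usage sketch, assuming a loaded Taxonomy and LocalTranslationContext; the observer name, date, and the date format accepted by normalize_date_for_visits are assumptions.

import pandas as pd

# Hypothetical one-species checklist; column names match what the function expects
df = pd.DataFrame({'CommonName': ['California Quail'], 'Total': [2]})
checklist = csv_dataframe_to_checklist(df, taxonomy, local_translation_context,
                                        observer_name='Jane Observer',
                                        xdates=['2020-12-26'])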
Example #15
 def set_target_taxonomy_by_string(self, taxonomy_string):
     '''Set the target_taxonomy instance variable by a string, which
     gets parsed into the requisite array form and stored in the instance
     variable'''
     self.target_taxonomy = Taxonomy.split_taxonomy(taxonomy_string)
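
A usage sketch on a hypothetical instance obj; Taxonomy.split_taxonomy is not shown, so the semicolon-delimited lineage string below is an assumption about its input format.

# Hypothetical: parse a lineage string into the target_taxonomy array
obj.set_target_taxonomy_by_string('Bacteria; Proteobacteria; Gammaproteobacteria')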
Example #16
NOUNSET_BANK = NounSetBank(DATA_DIR + 'nounsets.yml')
NOUN_FORMS = {
    lang: NounFormBank(DATA_DIR + 'nouns_{}.yml'.format(lang))
    for lang in LANGUAGES
}

PREPSET_BANK = PrepositionSetBank(DATA_DIR + 'prepsets.yml')

PRONSET_BANK = PronounSetBank(DATA_DIR + 'pronsets.yml')
PRONOUN_FORMS = {
    lang: PronounFormBank(DATA_DIR + 'prons_{}.yml'.format(lang))
    for lang in LANGUAGES
}

TAXONOMY = Taxonomy(DATA_DIR + 'taxonomy.yml')

#VERBSET_BANK = VerbSetBank(DATA_DIR + 'verbsets/')
VERBSET_BANK = VerbSetBank(DATA_DIR + 'verbsets.yml')
VERB_FORMS = {
    lang: VerbFormBank(DATA_DIR + 'verbs_{}.yml'.format(lang))
    for lang in LANGUAGES
}

TEMPLATE_DIR = DATA_DIR + 'templates/'
ADJP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'adjp_templates.yml')
ADVP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'advp_templates.yml')
CLAUSE_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'clause_templates.yml')
#CUSTOM_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'custom_templates.yml')
CUSTOM_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'custom_postedited.yml')
NP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'np_templates.yml')
Example #17
    def main(self, tree_filename, tree_format='newick', ids=None):
        col_delimiter = '\t|\t'
        row_delimiter = '\t|\n'
        url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the text dump
        for extract in ('nodes.dmp', 'names.dmp'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print 'Using existing copy of %s' % extract
            else:
                print 'Extracting %s from %s...' % (extract, filename)
                archive = tarfile.open(name=filename, mode='r:gz')
                archive.extract(extract, path=self.data_dir)
                archive.close()

        # get names for all tax_ids from names.dmp
        print 'Getting names...'
        scientific_names = {}
        other_names = defaultdict(set)
        with open(os.path.join(self.data_dir, 'names.dmp')) as names_file:
            for line in names_file:
                line = line.rstrip(row_delimiter)
                values = line.split(col_delimiter)
                tax_id, name_txt, _, name_type = values[:4]
                if name_type == 'scientific name':
                    scientific_names[tax_id] = name_txt
                else:
                    other_names[tax_id].add(name_txt)

        # read all node info from nodes.dmp
        print 'Reading taxonomy...'
        nodes = {}
        with open(os.path.join(self.data_dir, 'nodes.dmp')) as nodes_file:
            for line in nodes_file:
                line = line.rstrip(row_delimiter)
                values = line.split(col_delimiter)
                tax_id, parent_id = values[:2]
                if ids:
                    this_node = BaseTree.Clade(name=tax_id)
                else:
                    this_node = BaseTree.Clade(name=scientific_names[tax_id])

                nodes[tax_id] = this_node
                this_node.parent_id = parent_id

                if tree_format == 'cdao':
                    # add common names, synonyms, mispellings, etc. as skos:altLabels
                    if not hasattr(this_node, 'tu_attributes'):
                        this_node.tu_attributes = []
                    for x in other_names[tax_id]:
                        this_node.tu_attributes.append(
                            ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                             Taxonomy.format_rdf_string(x)))

        print 'Found %s OTUs.' % len(nodes)

        # create tree from nodes dictionary
        print 'Building tree...'
        for node_id, this_node in nodes.iteritems():
            if node_id == this_node.parent_id:
                root_node = this_node
                print 'Found root.'
            else:
                parent_node = nodes[this_node.parent_id]
                parent_node.clades.append(this_node)

            del this_node.parent_id

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print 'Writing %s tree to %s...' % (tree_format, tree_filename)
        bp.write([tree], tree_filename, tree_format)

        print 'Done!'
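
A minimal sketch of the names.dmp line format parsed above (fields separated by '\t|\t', rows terminated by '\t|\n'); the sample line uses the well-known NCBI taxid for Homo sapiens.

line = '9606\t|\tHomo sapiens\t|\t\t|\tscientific name\t|\n'
values = line.rstrip('\t|\n').split('\t|\t')
tax_id, name_txt, _, name_type = values[:4]
# tax_id == '9606', name_txt == 'Homo sapiens', name_type == 'scientific name'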
Example #18
def main():
    logger.warning("Start building taxonomy")
    # Load input: this includes reading network, text, and
    # a background corpus for contrastive analysis
    logger.info("Loading graph from file")
    A, node_info = utils.load_graph(args.data_dir,
                                    remove_citation=True,
                                    force_undirected=True)
    logger.info("Create HIN")
    G = HIN(A, node_info)

    logger.info("Load text")
    corpus = utils.load_documents(args.data_dir)

    motif_matchers = [
        Motif_KPV(),
        Motif_KPA(),
        Motif_KP(),
        Motif_KPVY(),
        Motif_KPAA()
    ]

    intermediate_dir = plib.Path(args.data_dir, "intermediate")
    if not intermediate_dir.is_dir():
        logger.warning(f"Creating intermediate dir {intermediate_dir}")
        intermediate_dir.mkdir(parents=False)

    # we collect all phrases
    T = []  # terms / phrases
    for info in node_info.values():
        if info.node_type == "K":
            T.append(info.entity_id)

    D = corpus
    tf_bg, idf_bg = utils.get_tf_idf_from_file(
        plib.Path(args.data_dir, "background_documents.txt"), T)

    taxo = Taxonomy(D, T, G)

    builder = NetTaxo(motif_matchers,
                      tf_lift=args.tf_lift,
                      idf_lift=args.idf_lift,
                      damping=args.damping,
                      conf_motif=Motif_KPA().motif_name)

    # set background corpus for contrastive analysis
    builder.set_background(tf_bg, idf_bg)
    builder.build(taxo, args.levels)

    # save
    output_dir = plib.Path(args.output_dir, config.unique_id)
    if not output_dir.is_dir():
        output_dir.mkdir(parents=True)
    logger.info(f"Saving to {output_dir}")
    taxo.save(output_dir)

    logger.info("Saving complete")

    # generate output
    taxo.visualize(plib.Path(output_dir, f"vis.pdf"))
    taxo.save_readable(output_dir)
Example #19
def merge_checklists(
    summary_base: Any, sector_files: List[Any],
    stem_to_colname: Union[dict, List[str]], taxonomy: Taxonomy,
    local_translation_context: LocalTranslationContext
) -> Tuple[pd.DataFrame, List[str], List[str]]:
    # Easier to use single column summary_base, but this will transform it if needed
    if isinstance(summary_base, Path):
        template = read_excel_or_csv_path(summary_base)
        # Create a single column master for summary
        summary_base = recombine_transformed_checklist(template, taxonomy)
    elif isinstance(summary_base, pd.DataFrame):
        summary_base = summary_base

    base_has_adult_col = 'Ad' in summary_base.columns
    base_has_immature_col = 'Im' in summary_base.columns
    has_adult_col = False
    has_immature_col = False

    # Start of big processing loop
    summary = summary_base.copy()
    sector_unique = 1
    sector_cols = []
    for idx, fpath in enumerate(sector_files):
        try:
            if isinstance(fpath, Path):
                sector_col = stem_to_colname.get(fpath.stem, None)
            else:
                sector_col = stem_to_colname[idx]
        except Exception as ee:
            print(ee, idx, fpath)
            sector_col = None

        if not sector_col:
            sector_col = f'X{sector_unique}'
            sector_unique += 1

        sector_cols.append(sector_col)
        print(f'Processing {sector_col}')

        summary_common_names = summary.CommonName.values
        summary_common_names_lower = [
            xs.lower() for xs in summary_common_names
        ]
        if isinstance(fpath, Path):
            checklist = read_excel_or_csv_path(fpath)
            # Only Excel files would be double column. CSV files could be hand made,
            # so clean them up. Double translation takes a long time, so avoid when
            # possible
            if fpath.suffix == '.xlsx':
                checklist = recombine_transformed_checklist(
                    checklist, taxonomy)
            else:
                cleaned_common_names = clean_common_names(
                    checklist.CommonName, taxonomy, local_translation_context)
                checklist.CommonName = cleaned_common_names
            # print(checklist.Total)
            xdtypes = {'CommonName': str, 'Total': int}
            checklist = checklist.astype(dtype=xdtypes)

            # so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
            # summary.NACC_SORT_ORDER = pd.Series(so).fillna(taxonomy.INVALID_NACC_SORT_ORDER)

        else:  # isinstance(summary_base, pd.DataFrame):
            checklist = fpath

        # Drop any rows with a blank CommonName. This can occur if the checklist is a summary
        # report with a 'Total' row at the bottom, and 'Total' is not a valid species
        checklist = checklist[checklist.CommonName != '']

        # Sector checklists may have added species not on the template
        checklist['cnlower'] = [xs.lower() for xs in checklist.CommonName]
        checklist_common_names_lower = set(
            [xs.lower() for xs in checklist.CommonName])
        names_to_add = checklist_common_names_lower - set(
            summary_common_names_lower)
        if not names_to_add == set():
            species_to_add = taxonomy.filter_species(list(names_to_add))
            if len(species_to_add) > 0:
                print(f'Added species: {species_to_add}')
            # Fix capitalization
            names_to_add = clean_common_names(list(names_to_add), taxonomy,
                                              local_translation_context)
            blank_row = pd.Series([''] * len(summary.columns),
                                  index=summary.columns)
            rows_to_add = []
            for cn in names_to_add:
                row = blank_row.copy()
                row['CommonName'] = cn
                if cn.lower() in species_to_add:
                    row['Rare'] = 'X'
                total = checklist[checklist.cnlower ==
                                  cn.lower()]['Total'].values[0]
                row[sector_col] = total
                rows_to_add.append(row)

            summary = summary.append(rows_to_add, ignore_index=True)

        #
        has_adult_col = 'Ad' in checklist.columns
        has_immature_col = 'Im' in checklist.columns

        summary[sector_col] = 0  # 'Total' field for this sector

        if has_adult_col:
            ad_col = f'Ad-{sector_col}'
            summary[ad_col] = 0

        if has_immature_col:
            im_col = f'Im-{sector_col}'
            summary[im_col] = 0

        # # S
        # # Fill in total for existing names
        # already_present_names = set(summary_common_names) & set(checklist.CommonName)
        # for cn in set(checklist.CommonName):
        #     total = checklist[checklist.CommonName == cn]['Total'].values[0]
        #     summary.loc[summary.CommonName == cn, sector_col] = total

        # print(summary.shape, len(summary_common_names_lower))
        summary_common_names_lower = [xs.lower() for xs in summary.CommonName]

        summary['cnlower'] = summary_common_names_lower
        for ix, row in checklist.iterrows():
            # if row.Total:
            #     print(row)
            total = row.FrozenTotal if 'FrozenTotal' in checklist.columns else row.Total
            mask = summary.cnlower == row.cnlower
            summary.loc[mask, sector_col] = total

        summary.drop(['cnlower'], axis=1, inplace=True)
        #     if has_adult_col:
        #         adult_total = checklist[checklist.CommonName == cn]['Ad'].values[0]
        #         summary.loc[summary.CommonName == cn, ad_col] = adult_total
        #
        #     if has_immature_col:
        #         immature_total = checklist[checklist.CommonName == cn]['Im'].values[0]
        #         summary.loc[summary.CommonName == cn, im_col] = immature_total

    # Fill in zeros for missing sector_col values; may have blanks if species added
    # for col in sector_cols:
    #     summary[col] = summary[col].apply(pd.to_numeric).fillna(0)

    # Do sums for Ad/Im columns. Ad == 'Adult/White'
    if base_has_adult_col:
        ad_cols = [xs for xs in summary.columns if xs.startswith('Ad-')]
        summary['Ad'] = summary[ad_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    if base_has_immature_col:
        im_cols = [xs for xs in summary.columns if xs.startswith('Im-')]
        summary['Im'] = summary[im_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    # Look up Group and TaxonOrder for anything missing these (may have been added species)

    for idx, row in summary.iterrows():
        record = taxonomy.find_local_name_row(row['CommonName'])
        if record is not None:
            summary.at[idx, 'TaxonOrder'] = record.TAXON_ORDER
            summary.at[idx, 'Group'] = record.SPECIES_GROUP
            so = record.NACC_SORT_ORDER if record.NACC_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'NACC_SORT_ORDER'] = so
            so = record.ABA_SORT_ORDER if record.ABA_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'ABA_SORT_ORDER'] = so
            summary.at[idx, 'Category'] = record.Category

    # Re-sort by TaxonOrder
    # Must sort before creating formulae for Total
    so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
    summary.NACC_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)
    so = pd.to_numeric(summary.ABA_SORT_ORDER, errors='coerce')
    summary.ABA_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)

    try:
        summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(
            drop=True)
    except TypeError as te:
        print(te)
        traceback.print_exc(file=sys.stdout)
        return summary

    # Now set the overall total field:
    #     sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    # summary['Total'] = summary[sector_cols].apply(pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    col_letters = excel_columns()
    #     team_start_col = col_letters[len(base_columns)]
    std_columns = [
        'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    # Filter out any missing columns
    std_columns = [col for col in std_columns if col in summary.columns]
    # team_start_col = col_letters[index_of_first_subtotal_column(summary)]
    sector_start_col = col_letters[len(std_columns)]
    sector_end_col = col_letters[len(summary.columns) - 1]
    total_formula = [
        f'=SUM(${sector_start_col}{ix}:${sector_end_col}{ix})'
        for ix in range(2, summary.shape[0] + 2)
    ]
    summary['Total'] = total_formula

    # Add last row for Total and each Sector total
    totals_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    totals_row['Group'] = 'Totals'
    totals_row['TaxonOrder'] = 99999
    totals_row['NACC_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER
    totals_row['ABA_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER

    # Formula for Grand Total, e.g. =SUM($D$2:$D$245)
    total_col_letter = col_letters[std_columns.index('Total')]
    total_formula = f'=SUM(${total_col_letter}2:${total_col_letter}{summary.shape[0] + 1})'
    totals_row.Total = total_formula

    # sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    sector_totals = summary[sector_cols].apply(
        pd.to_numeric).fillna(0).sum(axis=0).astype(int)
    for col, st in sector_totals.items():
        totals_row[col] = st

    summary = summary.append(totals_row, ignore_index=True)

    cols_to_drop = [
        col for col in summary.columns
        if (col.startswith('Ad-') or col.startswith('Im-'))
    ]
    summary.drop(labels=cols_to_drop, axis=1, inplace=True)

    summary.rename(columns={
        'Ad': 'Adult/White',
        'Im': 'Immature/Blue'
    },
                   inplace=True)

    # Re-order columns
    # print(sector_cols)
    # print(summary.columns)

    new_col_order = [
        col for col in [
            'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
            'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
        ] if col in summary.columns
    ]
    new_col_order.extend(sector_cols)
    summary = summary[new_col_order]

    # Don't hide 'Rare' since this will be frequently used in a filter
    cols_to_hide = [
        'D', 'Difficulty', 'Adult', 'Immature', 'W-morph', 'B-Morph'
    ]

    if 'Adult/White' in summary.columns:
        if summary['Adult/White'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Adult/White')
    if 'Immature/Blue' in summary.columns:
        if summary['Immature/Blue'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Immature/Blue')

    cols_to_highlight = list(
        set(summary.columns) & {'Total', 'Adult/White', 'Immature/Blue'})

    return summary, cols_to_hide, cols_to_highlight
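
As a standalone illustration of the per-row Total formula built above, a sketch assuming excel_columns() yields spreadsheet column letters 'A', 'B', 'C', ... (stood in here by ascii_uppercase) and that row 1 holds the header.

from string import ascii_uppercase

col_letters = list(ascii_uppercase)                  # stand-in for excel_columns()
std_columns = ['Group', 'CommonName', 'Rare', 'Total']
n_sectors, n_rows = 2, 3
sector_start_col = col_letters[len(std_columns)]     # first column after the std block
sector_end_col = col_letters[len(std_columns) + n_sectors - 1]
total_formula = [f'=SUM(${sector_start_col}{ix}:${sector_end_col}{ix})'
                 for ix in range(2, n_rows + 2)]
# -> ['=SUM($E2:$F2)', '=SUM($E3:$F3)', '=SUM($E4:$F4)']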
Example #20
    def expand_node(self, taxo: Taxonomy, node: TaxoNode, level: int,
                    n_children: int, max_level: int):
        """Expand a taxonomy node.

    Args:
      taxo: The Taxonomy object.
      node: The node to expand.
      level: Current level of taxonomy.
      n_children: Number of children to expand to next level.
    """
        logger.info(f"Expand node {node.prefix}")
        logger.info("1. Contrast analysis for term scoring")
        if node.parent is None:
            raise RuntimeError(
                "Contrastive analysis assumes parent node is set.")
        else:
            term_scores = contrast.node_contrast_analysis(
                node,
                node.parent,
                taxo.siblings(node),
                self.tf_lift * (config.LEVEL_DECAY**level),
                self.idf_lift * (config.LEVEL_DECAY**level),
            )
        for term, old_score in term_scores.items():
            term_scores[term] = old_score * node.term_prior[term]
        node.term_scores = term_scores

        logger.debug("Top terms for this node")
        logger.debug(
            str([
                utils.strip_phrase_tags(phrase)
                for phrase in utils.take_topk(term_scores, 20)
            ]))

        logger.info(
            f"check stopping criteria, level {level} >= {max_level} is {level >= max_level}"
        )
        if level >= max_level:
            return

        logger.info("Generate motif context")
        generate_motif_context(args.data_dir, level, taxo.G, node.terms,
                               node.docs, self.motif_matchers)
        sample_motif_context(args, level, self.conf_motif)

        logger.info("2. Local embedding")
        word_embed, net_embed = loc_emb.local_embedding(node, args.data_dir)
        wv_word = word_embed.syn0
        wv_net = net_embed.syn0

        logger.info("3. Term clustering")
        logger.debug(f"#term_scores {len(term_scores)}")
        topk = min(config.N_TOP_TERMS,
                   int(config.TOP_TERMS_PCT * len(term_scores)))
        topk_terms = utils.take_topk(term_scores, topk)
        clus_labels, aligned_terms = clus.term_clustering(
            topk_terms, wv_word, n_clusters=n_children)
        clus_labels_net, aligned_terms_net = clus.term_clustering(
            topk_terms, wv_net, n_clusters=n_children)
        map_ab = clus.align_clustering(clus_labels, clus_labels_net)

        logger.info("4. Anchor phrase selection")
        # anchor phrase selection w/ intersection
        # WT_clusters = []  # term weights
        term_weights_clusters = []
        for i in range(n_children):
            # get all terms in the cluster
            clus_terms_word = set(
                clus.get_cluster_terms(clus_labels, aligned_terms, i))
            clus_terms_net = set(
                clus.get_cluster_terms(clus_labels_net, aligned_terms_net,
                                       map_ab[i]))
            clus_terms = clus_terms_word & clus_terms_net
            term_nids = _term2nid(taxo.G, clus_terms)
            # get associated documents
            D_c, weights_c = clus.get_cluster_documents(
                taxo.G, node.docs, term_nids)
            # run contrastive analysis
            tf_c, idf_c = utils.get_tf_idf(D_c, clus_terms, weights=weights_c)
            next_level = level + 1
            term_scores_c = contrast.contrast_analysis(
                tf_c, idf_c, node.tf, node.idf,
                self.tf_lift * (config.LEVEL_DECAY**next_level),
                self.idf_lift * (config.LEVEL_DECAY**next_level))
            term_weights_clusters.append(term_scores_c)
            logger.debug("Cluster {}:: ".format(i) + str([
                utils.strip_phrase_tags(phrase)
                for phrase in utils.take_topk(term_scores_c, 30)
            ]))

        logger.info("5. Motif selection")
        cluster_seeds = []
        n_seed = config.N_ANCHOR_TERMS
        for i in range(n_children):
            seed_phrases = utils.take_topk(term_weights_clusters[i], n_seed)
            cluster_seeds.append(seed_phrases)

        motif_selection_sampling(args,
                                 level,
                                 cluster_seeds,
                                 keep_ratio=config.TOP_MOTIF_PCT)

        logger.info("6. Recompute embedding")
        joint_embed = loc_emb.joint_local_embedding(node, args.data_dir)
        wv_all = joint_embed.syn0

        logger.info("7. Soft clustering")
        clus_labels, aligned_terms, vmf = clus.soft_clustering(
            topk_terms, wv_all, n_clusters=n_children)

        logger.info("8. Generate next level")
        term_prior_clusters = []
        cluster_centers = []
        for i in range(n_children):
            # get all terms in the cluster
            clus_terms = clus.get_cluster_terms(clus_labels, aligned_terms, i)
            term_nids = _term2nid(taxo.G, clus_terms)
            # get associated documents
            D_c, weights_c = clus.get_cluster_documents(
                taxo.G, node.docs, term_nids)
            # run contrastive analysis
            tf_c, idf_c = utils.get_tf_idf(D_c, clus_terms, weights=weights_c)
            term_scores_c = contrast.contrast_analysis(tf_c, idf_c, node.tf,
                                                       node.idf)
            term_prior_clusters.append(term_scores_c)
            logger.debug("Cluster {}:: ".format(i) + str([
                utils.strip_phrase_tags(phrase)
                for phrase in utils.take_topk(term_scores_c, 30)
            ]))

        # generate next level terms and documents
        # compute clustering probability
        X = []
        X_terms = []
        for term in node.terms:
            try:
                X.append(wv_all[term])
                X_terms.append(term)
            except:
                pass
        X = np.vstack(X)
        clustering_probs = clus.get_soft_cluster_probs(X, vmf.cluster_centers_,
                                                       vmf.weights_,
                                                       vmf.concentrations_)
        clustering_probs = clustering_probs.T
        for idx_c in range(n_children):
            # find words in each cluster
            terms_c = []
            term_prior_c = dict()
            for i in range(X.shape[0]):
                if clustering_probs[i, idx_c] > 2 * (1 / n_children):
                    terms_c.append(X_terms[i])
                    term_prior_c[X_terms[i]] = clustering_probs[i, idx_c]

            # find documents associated with each cluster
            ranking, clustering_probs_net = clus.populate_clustering(
                taxo.G, n_children, term_prior_clusters, damping=0.8)
            doc_prior_c = dict()
            docs_c = dict()
            for paper_id, paper_content in node.docs.items():
                nid = taxo.G.find_by_entity_id("P", paper_id)
                score = clustering_probs_net[nid, idx_c]
                if score <= 2 * (1 / n_children):
                    continue
                docs_c[paper_id] = paper_content
                doc_prior_c[paper_id] = score

            node_c = TaxoNode(node.prefix + "/{}".format(idx_c), docs_c,
                              terms_c, doc_prior_c, term_prior_c)
            curr = node_c
            node_c.set_parent(node)
            node.add_child(node_c)
Example #21
 def taxonomy_array(self):
     return Taxonomy.split_taxonomy(self.taxonomy)
Example #22
    def main(self, tree_filename, tree_format='newick'):
        col_delimiter = '\t'
        url = 'http://ecat-dev.gbif.org/repository/export/checklist1.zip'
        
        # download the taxonomy archive
        filename = self.download_file(url)
        
        # extract the tables
        extract = 'taxon.txt'
        if os.path.exists(os.path.join(self.data_dir, extract)):
            print 'Using existing copy of %s' % extract
        else:
            print 'Extracting %s from %s...' % (extract, filename)
            archive = zipfile.ZipFile(filename, mode='r')
            archive.extract(extract, path=self.data_dir)
            archive.close()

        # build BioPython clades
        print 'Reading taxonomy...'
        nodes = {}
        with open(os.path.join(self.data_dir, 'taxon.txt')) as taxonomy_file:
            for line in taxonomy_file:
                line = line.strip()
                values = line.split(col_delimiter)
                id, parent_id, syn_id, _, name, _, status = values[:7]
                
                # skip incertae sedis taxa
                if id == '0': continue
                
                if syn_id and not 'synonym' in status:
                    continue
                elif syn_id and 'synonym' in status:
                    if tree_format == 'cdao':
                        nodes[id] = ('synonym', name, syn_id)
                elif not syn_id:
                    nodes[id] = BaseTree.Clade(name=name)
                    nodes[id].parent_id = parent_id
        
        print 'Found %s OTUs.' % len(nodes)
        nodes[''] = root_node = BaseTree.Clade()
        
        # create tree from nodes dictionary
        print 'Building tree...'
        for node_id, this_node in nodes.iteritems():
            if not node_id: continue
            
            if isinstance(this_node, BaseTree.Clade):
                try:
                    parent_node = nodes[this_node.parent_id]
                    parent_node.clades.append(this_node)
                    del this_node.parent_id
                except (KeyError, AttributeError): pass
                
            elif this_node[0] == 'synonym':
                _, name, syn_id = this_node
                try:
                    accepted_node = nodes[syn_id]
                except KeyError: continue
                
                if not isinstance(accepted_node, BaseTree.Clade): continue
                
                if not hasattr(accepted_node, 'tu_attributes'):
                    nodes[syn_id].tu_attributes = []
                nodes[syn_id].tu_attributes.append(('<http://www.w3.org/2004/02/skos/core#altLabel>', Taxonomy.format_rdf_string(name)))
                #print 'Synonym: %s -> %s' % (name, nodes[syn_id].name)
        
        tree = BaseTree.Tree(root=root_node)
        
        # write tree to file
        print 'Writing %s tree to %s...' % (tree_format, tree_filename)
        bp.write([tree], tree_filename, tree_format)
        
        print 'Done!'
Example #23
    def main(self, tree_filename, tree_format='newick'):
        col_delimiter = '|'
        url = 'http://www.itis.gov/downloads/itisMySQLTables.tar.gz'
        
        # download the taxonomy archive
        filename = self.download_file(url)
        
        # extract the tables
        for extract in ('taxonomic_units', 'longnames', 'synonym_links', 'vernaculars'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print 'Using existing copy of %s' % extract
            else:
                print 'Extracting %s from %s...' % (extract, filename)
                archive = tarfile.open(name=filename, mode='r:gz')
                full_extract = [x for x in archive.getnames() if x.split('/')[-1] == extract][0]
                member = archive.getmember(full_extract)
                member.name = extract
                archive.extract(extract, path=self.data_dir)
                archive.close()

        # get names for all ITIS TSNs from longnames table
        print 'Getting names...'
        names = {}
        with open(os.path.join(self.data_dir, 'longnames')) as names_file:
            for line in names_file:
                line = line.strip()
                values = line.split(col_delimiter)
                tax_id, name = values
                names[tax_id] = name
        
        # read all node info from taxonomic_units
        print 'Reading taxonomy...'
        nodes = {}
        with open(os.path.join(self.data_dir, 'taxonomic_units')) as nodes_file:
            for line in nodes_file:
                line = line.strip()
                values = line.split(col_delimiter)
                
                (tax_id, usage, parent_id,
                    uncertain_parent) = [values[n] for n in (0, 10, 17, 23)]
                
                #if uncertain_parent: continue
                if not usage in ('accepted', 'valid'): continue
                
                name = names[tax_id]
                this_node = BaseTree.Clade(name=name)
                nodes[tax_id] = this_node
                this_node.parent_id = parent_id
                
        other_names = defaultdict(set)
        if tree_format == 'cdao':
            # get synonym definitions
            print 'Getting synonyms...'
            with open(os.path.join(self.data_dir, 'synonym_links')) as synonym_file:
                for line in synonym_file:
                    line = line.strip()
                    values = line.split(col_delimiter)
                    node_id, syn_id, _ = values
                    nodes[node_id] = ('synonym', names[node_id], syn_id)
            with open(os.path.join(self.data_dir, 'vernaculars')) as synonym_file:
                for line in synonym_file:
                    line = line.strip()
                    values = line.split(col_delimiter)
                    tax_id, name = values[:2]
                    other_names[tax_id].add(name)
                
        print 'Found %s OTUs.' % len(nodes)
        nodes['0'] = root_node = BaseTree.Clade()
        
        # create tree from nodes dictionary
        print 'Building tree...'
        for node_id, this_node in nodes.iteritems():
            if node_id == '0': continue
            
            if isinstance(this_node, BaseTree.Clade):
                try:
                    parent_node = nodes[this_node.parent_id]
                    parent_node.clades.append(this_node)
            
                except (KeyError, AttributeError): continue
                
                del this_node.parent_id
                
                if not hasattr(this_node, 'tu_attributes'):
                    this_node.tu_attributes = []
                for name in other_names[node_id]:
                    this_node.tu_attributes.append(('<http://www.w3.org/2004/02/skos/core#altLabel>', Taxonomy.format_rdf_string(name)))

            elif this_node[0] == 'synonym':
                _, name, syn_id = this_node
                try:
                    accepted_node = nodes[syn_id]
                except KeyError: continue
                
                if not isinstance(accepted_node, BaseTree.Clade): continue
                
                if not hasattr(accepted_node, 'tu_attributes'):
                    nodes[syn_id].tu_attributes = []
                nodes[syn_id].tu_attributes.append(('<http://www.w3.org/2004/02/skos/core#altLabel>', Taxonomy.format_rdf_string(name)))
                #print 'Synonym: %s -> %s' % (name, nodes[syn_id].name)
        
        tree = BaseTree.Tree(root=root_node)
        
        # write tree to file
        print 'Writing %s tree to %s...' % (tree_format, tree_filename)
        bp.write([tree], tree_filename, tree_format)
        
        print 'Done!'
Example #24
 def taxonomy_array(self):
     return Taxonomy.split_taxonomy(self.taxonomy)
Example #25
    def main(self, tree_filename, tree_format='newick', ids=None):
        col_delimiter = '\t|\t'
        row_delimiter = '\t|\n'
        url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
        
        # download the taxonomy archive
        filename = self.download_file(url)
        
        # extract the text dump
        for extract in ('nodes.dmp', 'names.dmp'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print 'Using existing copy of %s' % extract
            else:
                print 'Extracting %s from %s...' % (extract, filename)
                archive = tarfile.open(name=filename, mode='r:gz')
                archive.extract(extract, path=self.data_dir)
                archive.close()
        
        # get names for all tax_ids from names.dmp
        print 'Getting names...'
        scientific_names = {}
        other_names = defaultdict(set)
        with open(os.path.join(self.data_dir, 'names.dmp')) as names_file:
            for line in names_file:
                line = line.rstrip(row_delimiter)
                values = line.split(col_delimiter)
                tax_id, name_txt, _, name_type = values[:4]
                if name_type == 'scientific name':
                    scientific_names[tax_id] = name_txt
                else:
                    other_names[tax_id].add(name_txt)
        
        # read all node info from nodes.dmp
        print 'Reading taxonomy...'
        nodes = {}
        with open(os.path.join(self.data_dir, 'nodes.dmp')) as nodes_file:
            for line in nodes_file:
                line = line.rstrip(row_delimiter)
                values = line.split(col_delimiter)
                tax_id, parent_id = values[:2]
                if ids:
                    this_node = BaseTree.Clade(name=tax_id)
                else:
                    this_node = BaseTree.Clade(name=scientific_names[tax_id])
                
                nodes[tax_id] = this_node
                this_node.parent_id = parent_id

                if tree_format == 'cdao':
                    # add common names, synonyms, mispellings, etc. as skos:altLabels
                    if not hasattr(this_node, 'tu_attributes'):
                        this_node.tu_attributes = []
                    for x in other_names[tax_id]:
                        this_node.tu_attributes.append(('<http://www.w3.org/2004/02/skos/core#altLabel>', Taxonomy.format_rdf_string(x)))

        
        print 'Found %s OTUs.' % len(nodes)
        
        # create tree from nodes dictionary
        print 'Building tree...'
        for node_id, this_node in nodes.iteritems():
            if node_id == this_node.parent_id:
                root_node = this_node
                print 'Found root.'
            else:
                parent_node = nodes[this_node.parent_id]
                parent_node.clades.append(this_node)
                
            del this_node.parent_id
        
        tree = BaseTree.Tree(root=root_node)
        
        # write tree to file
        print 'Writing %s tree to %s...' % (tree_format, tree_filename)
        bp.write([tree], tree_filename, tree_format)
        
        print 'Done!'
# Instantiate Page and Taxonomy objects from the filtered results
page = tk.result_list_page
tax = tk.result_list_tax

# Target object lists
aim_page = []
aim_tax = []

for item in page:  # instantiate pages
    aim_page.append(Page(item=item))

for item in tax:  # instantiate taxonomies
    aim_tax.append(Taxonomy(tax_name=item['type'], item=item))

# -----------------------------------      XiaoAi       ------------------------------------------
xiaoai = XiaoAi()

# Add the tasks to XiaoAi's queue
for item in aim_page:
    xiaoai.add_task(item)

for item in aim_tax:
    xiaoai.add_task(item)

# Start the automated generation run
Example #27
    def assign_taxonomy(self, key, output_dir, dna_region, names_file,
                        ref_taxa):

        from taxonomy import Taxonomy, consensus
        #results = uc_results
        results = {}

        try:
            self.runobj.run_status_file_h.write(
                json.dumps({'status': "STARTING_ASSIGN_TAXONOMY: " + key}) +
                "\n")
        except:
            pass
        #test_read='FI1U8LC02GEF7N'
        # open gast_file to get results
        "to Dirs"
        tagtax_terse_filename = os.path.join(output_dir, "tagtax_terse")
        tagtax_long_filename = os.path.join(output_dir, "tagtax_long")
        tagtax_terse_fh = open(tagtax_terse_filename, 'w')
        tagtax_long_fh = open(tagtax_long_filename, 'w')
        tagtax_long_fh.write("\t".join([
            "read_id", "taxonomy", "distance", "rank", "refssu_count", "vote",
            "minrank", "taxa_counts", "max_pcts", "na_pcts", "refhvr_ids"
        ]) + "\n")
        gast_file = os.path.join(output_dir, "gast" + dna_region)
        if not os.path.exists(gast_file):
            logging.info("gast:assign_taxonomy: Could not find gast file: " +
                         gast_file + ". Returning")
            return results

        for line in open(gast_file, 'r'):
            # must split on tab because last field may be empty and must be maintained as blank
            data = line.strip().split("\t")
            if len(data) == 3:
                data.append("")
            # 0=id, 1=ref, 2=dist, 3=align 4=frequency
            #if data[0]==test_read:
            #    print 'found test in gastv6 ', data[1].split('|')[0], data[2], data[3]

            read_id = data[0]
            if read_id in results:
                results[read_id].append(
                    [data[1].split('|')[0], data[2], data[3], data[4]])
            else:
                results[read_id] = [[
                    data[1].split('|')[0], data[2], data[3], data[4]
                ]]

        for line in open(names_file, 'r'):
            data = line.strip().split("\t")
            dupes = data[1].split(",")
            read_id = data[0]
            taxObjects = []
            distance = 0
            frequency = 0
            refs_for = {}

            #print 'read_id', read_id
            'assign taxonomy method, either fake or real'
            if read_id not in results:
                results[read_id] = [
                    "Unknown", '1', "NA", '0', '0', "NA", "0;0;0;0;0;0;0;0",
                    "0;0;0;0;0;0;0;0", "100;100;100;100;100;100;100;100"
                ]
                refs_for[read_id] = ["NA"]
            else:
                'it is in results[]'
                #print 'read_id in res', read_id, results[read_id]
                #if read_id == test_read_id:
                #    print 'found ', test_read_id, results[test_read_id]
                for i in range(0, len(results[read_id])):
                    #for resultread_id in results[read_id]:
                    #print 'resread_id', results[read_id]
                    ref = results[read_id][i][0]
                    if ref in ref_taxa:
                        for tax in ref_taxa[ref]:
                            for t in tax:
                                taxObjects.append(Taxonomy(t))
                    else:
                        pass

                    if read_id in refs_for:
                        #if read_id ==test_read_id:
                        #    print '2', read_id, refs_for[test_read_id]
                        if results[read_id][i][0] not in refs_for[read_id]:
                            refs_for[read_id].append(results[read_id][i][0])
                    else:
                        #if read_id == test_read_id:
                        #    print '1', read_id, results[read_id][i][0]
                        refs_for[read_id] = [results[read_id][i][0]]

                    # should all be the same distance for the duplicates
                    distance = results[read_id][i][1]
                    frequency = results[read_id][i][3]
                #Lookup the consensus taxonomy for the array
                taxReturn = consensus(taxObjects, C.majority)

                # 0=taxObj, 1=winning vote, 2=minrank, 3=rankCounts, 4=maxPcts, 5=naPcts;
                taxon = taxReturn[0].taxstring()
                #if taxon[-3:] = ';NA':
                #    taxon = taxon[:-3]
                #tax_counter[taxon]
                rank = taxReturn[0].depth()
                #print read_id, taxon, rank, taxReturn[0], taxReturn[1]
                if not taxon: taxon = "Unknown"

                # (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts)
                results[read_id] = [
                    taxon,
                    str(distance), rank,
                    str(len(taxObjects)),
                    str(taxReturn[1]), taxReturn[2], taxReturn[3],
                    taxReturn[4], taxReturn[5]
                ]
                #print "\t".join([read_id, taxon, str(distance), rank, str(len(taxObjects)), str(taxReturn[1]), taxReturn[2], taxReturn[3], taxReturn[4], taxReturn[5]]) + "\n"
            # read_id  taxonomy  distance  rank  refssu_count  vote  minrank  taxa_counts  max_pcts  na_pcts  refhvr_ids
            # D4ZHLFP1:25:B022DACXX:3:1101:12919:40734 1:N:0:TGACCA|frequency:162     Bacteria;Proteobacteria;Gammaproteobacteria     0.117   class   2       100     genus   1;1;1;2;2;2;0;0 100;100;100;50;50;50;0;0        0;0;0;0;0;0;100;100     v6_CI671
            # D4ZHLFP1:25:B022DACXX:3:1101:10432:76870 1:N:0:TGACCA|frequency:105     Bacteria;Proteobacteria;Gammaproteobacteria     0.017   class   1       100     class   1;1;1;0;0;0;0;0 100;100;100;0;0;0;0;0   0;0;0;100;100;100;100;100       v6_BW306

            # Replace hash with final taxonomy results, for each copy of the sequence
            for d in dupes:
                # print OUT join("\t", $d, @{$results{$read_id}}, join(", ", sort @{$refs_for{$read_id}})) . "\n";
                d = d.strip()
                tagtax_long_fh.write(d + "\t" + "\t".join(results[read_id]) +
                                     "\t" +
                                     ', '.join(sorted(refs_for[read_id])) +
                                     "\n")
                tagtax_terse_fh.write(d + "\t" + results[read_id][0] + "\t" +
                                      results[read_id][2] + "\t" +
                                      results[read_id][3] + "\t" +
                                      ', '.join(sorted(refs_for[read_id])) +
                                      "\t" + results[read_id][1] + "\t" +
                                      str(frequency) + "\n")

        tagtax_terse_fh.close()
        tagtax_long_fh.close()
        return results
Example #28
    def main(self, tree_filename, tree_format='newick'):
        col_delimiter = '|'
        url = 'http://www.itis.gov/downloads/itisMySQLTables.tar.gz'

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the tables
        for extract in ('taxonomic_units', 'longnames', 'synonym_links',
                        'vernaculars'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print 'Using existing copy of %s' % extract
            else:
                print 'Extracting %s from %s...' % (extract, filename)
                archive = tarfile.open(name=filename, mode='r:gz')
                full_extract = [
                    x for x in archive.getnames()
                    if x.split('/')[-1] == extract
                ][0]
                member = archive.getmember(full_extract)
                member.name = extract
                archive.extract(extract, path=self.data_dir)
                archive.close()

        # get names for all ITIS TSNs from longnames table
        print 'Getting names...'
        names = {}
        with open(os.path.join(self.data_dir, 'longnames')) as names_file:
            for line in names_file:
                line = line.strip()
                values = line.split(col_delimiter)
                tax_id, name = values
                names[tax_id] = name

        # read all node info from taxonomic_units
        print 'Reading taxonomy...'
        nodes = {}
        with open(os.path.join(self.data_dir,
                               'taxonomic_units')) as nodes_file:
            for line in nodes_file:
                line = line.strip()
                values = line.split(col_delimiter)

                (tax_id, usage, parent_id,
                 uncertain_parent) = [values[n] for n in (0, 10, 17, 23)]

                #if uncertain_parent: continue
                if not usage in ('accepted', 'valid'): continue

                name = names[tax_id]
                this_node = BaseTree.Clade(name=name)
                nodes[tax_id] = this_node
                this_node.parent_id = parent_id

        other_names = defaultdict(set)
        if tree_format == 'cdao':
            # get synonym definitions
            print 'Getting synonyms...'
            with open(os.path.join(self.data_dir,
                                   'synonym_links')) as synonym_file:
                for line in synonym_file:
                    line = line.strip()
                    values = line.split(col_delimiter)
                    node_id, syn_id, _ = values
                    nodes[node_id] = ('synonym', names[node_id], syn_id)
            with open(os.path.join(self.data_dir,
                                   'vernaculars')) as synonym_file:
                for line in synonym_file:
                    line = line.strip()
                    values = line.split(col_delimiter)
                    tax_id, name = values[:2]
                    other_names[tax_id].add(name)

        print 'Found %s OTUs.' % len(nodes)
        nodes['0'] = root_node = BaseTree.Clade()

        # create tree from nodes dictionary
        print 'Building tree...'
        for node_id, this_node in nodes.iteritems():
            if node_id == '0': continue

            if isinstance(this_node, BaseTree.Clade):
                try:
                    parent_node = nodes[this_node.parent_id]
                    parent_node.clades.append(this_node)

                except (KeyError, AttributeError):
                    continue

                del this_node.parent_id

                if not hasattr(this_node, 'tu_attributes'):
                    this_node.tu_attributes = []
                for name in other_names[node_id]:
                    this_node.tu_attributes.append(
                        ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                         Taxonomy.format_rdf_string(name)))

            elif this_node[0] == 'synonym':
                _, name, syn_id = this_node
                try:
                    accepted_node = nodes[syn_id]
                except KeyError:
                    continue

                if not isinstance(accepted_node, BaseTree.Clade): continue

                if not hasattr(accepted_node, 'tu_attributes'):
                    nodes[syn_id].tu_attributes = []
                nodes[syn_id].tu_attributes.append(
                    ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                     Taxonomy.format_rdf_string(name)))
                #print 'Synonym: %s -> %s' % (name, nodes[syn_id].name)

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print 'Writing %s tree to %s...' % (tree_format, tree_filename)
        bp.write([tree], tree_filename, tree_format)

        print 'Done!'
Example #29
def build_full_tally_sheet(double_translated, fpath: Path, taxonomy: Taxonomy,
                           parameters: Parameters, circle_prefix: str):
    candidate_names = [x for x, y in double_translated]
    local_names = process_exceptions(candidate_names, fpath, circle_prefix)

    # if issf etc in list, then base species must be also
    issfs = taxonomy.filter_issf(local_names)
    for cn in issfs:
        base_species = taxonomy.report_as(cn)
        if base_species:
            local_names.append(base_species)

    entries = []
    for local_name in local_names:
        # common_name, taxon_order, species_group, NACC_SORT_ORDER
        record = taxonomy.find_local_name_row(local_name)
        if record is not None:
            # e.g. ('White-throated Sparrow', 31943, 'New World Sparrows', 1848.0)
            entry = (record.comName, record.TAXON_ORDER, record.SPECIES_GROUP,
                     record.NACC_SORT_ORDER, record.ABA_SORT_ORDER, '', 0
                     )  # append 'Rare', 'Total'
            entries.append(entry)

    df = pd.DataFrame(entries,
                      columns=[
                          'CommonName', 'TaxonOrder', 'Group',
                          'NACC_SORT_ORDER', 'ABA_SORT_ORDER', 'Rare', 'Total'
                      ])

    # Re-order
    cols = [
        'Group', 'CommonName', 'Rare', 'Total', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    local_checklist = df[cols]
    local_checklist = local_checklist.sort_values(by='TaxonOrder')
    #     local_checklist.shape

    # double_translated may have duplicates
    local_checklist = local_checklist[
        ~local_checklist.duplicated(subset=['CommonName'], keep='first')]

    local_checklist = process_annotations_or_rarities(local_checklist, fpath,
                                                      circle_prefix)

    # Re-order columns
    preferred_order = [
        'Group', 'CommonName', 'Rare', 'D', 'Total', 'Ad', 'Im', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER', 'Difficulty', 'Adult', 'Immature',
        'W-morph', 'B-Morph', 'CountSpecial'
    ]
    newcols = [
        col for col in preferred_order if col in local_checklist.columns
    ]
    local_checklist = local_checklist[newcols]

    # Write out full tally sheet
    # circle_code = circle_prefix[0:4]
    # double_path = outputs_path / f'{circle_code}-DoubleX.xlsx'
    # write_local_checklist_with_group(local_checklist, double_path, parameters.parameters)

    return local_checklist
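# A hedged usage sketch, not from the original module: write the checklist
# returned by build_full_tally_sheet to an Excel file with plain pandas,
# instead of the project-specific write_local_checklist_with_group helper
# shown commented out above. The output path and sheet name are placeholder
# assumptions.
from pathlib import Path

def write_tally_sheet(local_checklist, outputs_path: Path, circle_prefix: str) -> Path:
    circle_code = circle_prefix[0:4]
    out_path = outputs_path / f'{circle_code}-TallySheet.xlsx'
    # index=False keeps the sheet limited to the checklist columns themselves
    local_checklist.to_excel(out_path, index=False, sheet_name='Tally')
    return out_path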
Example #30
#
# TODO: list-types is missing
#

INFO = """
This is the CLI for the Orange Button Core library. Information is available at the following URLs:

Orange Button Overview: https://sunspec.org/orange-button-initiative/
Orange Button GitHub: https://github.com/SunSpecOrangeButton
Orange Button CLI GitHub: https://github.com/SunSpecOrangeButton/core
"""

DASHES = "---------------------------------------------------------------------------------------"

taxonomy = Taxonomy()
csv = False
json = False
xml = False


def info(args):
    print(INFO)


def convert(args):

    p = Parser(taxonomy)

    ff = None
    if json:
Example #31
""" Script used to generate ncbi_subset_tax.json

NCBI taxonomy downloaded on July 1st, 2019.
"""

from taxonomy import Taxonomy

tax = Taxonomy.from_ncbi("nodes.dmp", "names.dmp")
new_tax = tax.prune(remove=['28384', '12908', '10239', '2', '2157'] +
                    [t for t in tax.children("2759") if t != "543769"])

with open("new_tax.json", "wb") as f:
    f.write(new_tax.to_json(True))
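# Hedged follow-up sketch, not part of the original script: reload the pruned
# taxonomy from the JSON written above. Taxonomy.from_json is assumed to be
# the counterpart of to_json in the same `taxonomy` package; check the
# installed version before relying on it.
with open("new_tax.json") as f:
    pruned = Taxonomy.from_json(f.read())

# After pruning, '2759' (Eukaryota) should keep only the '543769' subtree.
print(pruned.children("2759"))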
Example #32
def hits2taxa(input, out, db, verbose, limit=0):
    """Process fastq from input file.
    You may play with bufsize, so processes runs without waiting.
    """
    #init taxonomy
    taxa = Taxonomy(db)

    #hadle gzipped/bzip2 stream
    if input.name.endswith('.gz'):
        input = gzip.open(input.name)
    elif input.name.endswith('.bz2'):
        import bz2
        input = bz2.BZ2File(input.name)
    #get match generator
    if input == sys.stdin:
        line0 = input.readline()
        if line0.startswith('@'):
            mGenerator = get_matches_sam(input, verbose)
        else:
            mGenerator = get_matches_blast8(input, verbose)
    #get sam stream
    elif input.name.endswith(('.sam', '.sam.gz')):
        mGenerator = get_matches_sam(input, verbose)
    else:
        mGenerator = get_matches_blast8(input, verbose)

    #process reads in 1K batches
    if verbose:
        sys.stderr.write("[%s] Processing reads from %s ...\n" %
                         (datetime.ctime(datetime.now()), input.name))
    #get taxa and genes
    taxid2reads = {}
    taxid2matches = {}
    k = 0
    for i, (rname, hits) in enumerate(mGenerator, 1):
        if limit and i > limit:
            break
        if not rname:
            continue
        #print info
        if verbose and i % 1e4 == 1:
            sys.stderr.write(" %s parsed. %.2f%s with taxa  \r" %
                             (i, k * 100.0 / i, '%'))
        #get taxa
        taxid, matches = get_taxa(hits, taxa, verbose)
        if not taxid:
            continue
        k += 1
        if taxid not in taxid2reads:
            taxid2reads[taxid] = 0
        #store read name & genes
        taxid2reads[taxid] += 1

    #report
    if not taxid2reads:
        sys.exit("No matches found!")
    ##foreign reads
    freads = sum(reads for taxid, reads in taxid2reads.iteritems())
    header = "#name\ttaxid\treads\t%\n"
    out.write(header)
    out.write("%s\t%s\t%s\t%.2f\n" % ("unknown", "-", i - freads, 100.0 *
                                      (i - freads) / i))
    for taxid, reads in sorted(taxid2reads.iteritems(),
                               key=lambda x: x[1],
                               reverse=True)[:10]:
        out.write("%s\t%s\t%s\t%.2f\n" %
                  (taxa[taxid][1], taxid, reads, 100.0 * reads / i))
    #print summary
    sys.stderr.write("[hits2taxa] %s entries processed!\n" % (i, ))
Example #33
    def main(self, tree_filename, tree_format='newick'):
        col_delimiter = '\t'
        url = 'http://ecat-dev.gbif.org/repository/export/checklist1.zip'

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the tables
        extract = 'taxon.txt'
        if os.path.exists(os.path.join(self.data_dir, extract)):
            print 'Using existing copy of %s' % extract
        else:
            print 'Extracting %s from %s...' % (extract, filename)
            archive = zipfile.ZipFile(filename, mode='r')
            archive.extract(extract, path=self.data_dir)
            archive.close()

        # build BioPython clades
        print 'Reading taxonomy...'
        nodes = {}
        with open(os.path.join(self.data_dir, 'taxon.txt')) as taxonomy_file:
            for line in taxonomy_file:
                line = line.strip()
                values = line.split(col_delimiter)
                id, parent_id, syn_id, _, name, _, status = values[:7]

                # skip incertae sedis taxa
                if id == '0': continue

                if syn_id and not 'synonym' in status:
                    continue
                elif syn_id and 'synonym' in status:
                    if tree_format == 'cdao':
                        nodes[id] = ('synonym', name, syn_id)
                elif not syn_id:
                    nodes[id] = BaseTree.Clade(name=name)
                    nodes[id].parent_id = parent_id

        print 'Found %s OTUs.' % len(nodes)
        nodes[''] = root_node = BaseTree.Clade()

        # create tree from nodes dictionary
        print 'Building tree...'
        for node_id, this_node in nodes.iteritems():
            if not node_id: continue

            if isinstance(this_node, BaseTree.Clade):
                try:
                    parent_node = nodes[this_node.parent_id]
                    parent_node.clades.append(this_node)
                    del this_node.parent_id
                except (KeyError, AttributeError):
                    pass

            elif this_node[0] == 'synonym':
                _, name, syn_id = this_node
                try:
                    accepted_node = nodes[syn_id]
                except KeyError:
                    continue

                if not isinstance(accepted_node, BaseTree.Clade): continue

                if not hasattr(accepted_node, 'tu_attributes'):
                    nodes[syn_id].tu_attributes = []
                nodes[syn_id].tu_attributes.append(
                    ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                     Taxonomy.format_rdf_string(name)))
                #print 'Synonym: %s -> %s' % (name, nodes[syn_id].name)

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print 'Writing %s tree to %s...' % (tree_format, tree_filename)
        bp.write([tree], tree_filename, tree_format)

        print 'Done!'
Example #34
    def set_target_taxonomy_by_string(self, taxonomy_string):
        '''Set the target_taxonomy instance variable from a string, which
        gets parsed into the requisite array form and stored in the instance
        variable.'''
        self.target_taxonomy = Taxonomy.split_taxonomy(taxonomy_string)
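# Hedged illustration, not taken from the library: Taxonomy.split_taxonomy is
# expected to turn a delimited lineage string into a list of rank names,
# roughly like the sketch below. Verify the real delimiter handling against
# the Taxonomy implementation before relying on it.
def split_taxonomy_sketch(taxonomy_string):
    # split on ';' and drop surrounding whitespace and empty ranks
    return [rank.strip() for rank in taxonomy_string.split(';') if rank.strip()]

# e.g. split_taxonomy_sketch('Bacteria; Proteobacteria; Gammaproteobacteria')
#      -> ['Bacteria', 'Proteobacteria', 'Gammaproteobacteria']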
Example #35
    def _create_tax(self):
        # https://en.wikipedia.org/wiki/Newick_format#Examples
        return Taxonomy.from_newick("(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;")
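# Hedged follow-up, assuming the same `taxonomy` package as the NCBI example
# earlier in this listing: a quick check of the tiny Newick taxonomy built
# above, using the children() accessor that example also relies on. Other
# accessors may exist but are not shown here, so verify them against the
# installed package.
tax = Taxonomy.from_newick("(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;")
print(tax.children("F"))   # expected: the internal node 'E' plus leaves 'A' and 'B'
print(tax.children("E"))   # expected: leaves 'C' and 'D'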