def read_taxonomy(tax_f):
    root = TNode('*', [])
    tax = Taxonomy(tax_f, root)
    with open(tax_f) as f:
        for line in f:
            node_name, ph_str = line.strip('\r\n').split('\t')
            node = TNode(node_name, ph_str.split(','))
            tax.add_node(node)
    return tax
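# Usage sketch for read_taxonomy (hypothetical file name; assumes each input
# line is "<node_name>\t<phrase1>,<phrase2>,..." as parsed above):
#
#   tax = read_taxonomy('taxonomy.tsv')
#   # a line such as "mammal\tcat,dog,horse" becomes TNode('mammal', ['cat', 'dog', 'horse'])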
def show_lines_not_found_in_taxonomy(double_translated, taxonomy: Taxonomy):
    # ['Group', 'CommonName', 'Rare', 'Total', 'TaxonOrder']
    for local_name, _ in double_translated:
        row = taxonomy.find_local_name_row(local_name)
        if row is None:
            print(f'Not found in taxonomy: {local_name}')
def main():
    import argparse
    usage = "%(prog)s -v"
    parser = argparse.ArgumentParser(description=desc, epilog=epilog,  # usage=usage,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version', action='version', version='1.0b')
    parser.add_argument("-v", "--verbose", default=False, action="store_true",
                        help="verbose")
    parser.add_argument('-t', '--taxids', nargs="+", type=int,
                        help="group taxid(s) [%(default)s]")
    parser.add_argument("--taxadb", default="/users/tg/lpryszcz/cluster/rapsi/taxonomy.db3",
                        help="taxonomy path [%(default)s]")
    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    # init taxonomy
    taxa = Taxonomy(o.taxadb)

    # init metaphors connection
    cur = _getConnection()
    cur.execute("select taxid, name from species")
    species = {}
    for taxid, name in cur.fetchall():
        species[taxid] = (taxid, name)
    if o.verbose:
        sys.stderr.write("%s species in database\n" % len(species))

    # process taxa groups
    for taxid in o.taxids:
        # fetch proteins from given taxa
        taxid2proteomes(cur, species, taxa, taxid, o.verbose)
def main():
    import argparse
    usage = "%(prog)s -v"
    parser = argparse.ArgumentParser(description=desc, epilog=epilog,  # usage=usage,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version', action='version', version='1.0b')
    parser.add_argument("-v", "--verbose", default=False, action="store_true",
                        help="verbose")
    parser.add_argument('-d', '--db', default="metaphors_201405",
                        help="database name [%(default)s]")
    parser.add_argument('-t', '--taxids', nargs="+", type=int,
                        help="group taxid(s) [%(default)s]")
    parser.add_argument("--taxadb", default="/users/tg/lpryszcz/cluster/rapsi/taxonomy.db3",
                        help="taxonomy path [%(default)s]")
    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    # init taxonomy
    taxa = Taxonomy(o.taxadb)

    # init metaphors connection
    m = dbClient.metaphors(o.db)
    if o.verbose:
        sys.stderr.write("%s species in %s database\n" % (len(m.species), o.db))

    # process taxa groups
    for taxid in o.taxids:
        # fetch proteins from given taxa
        taxid2proteomes(m, taxa, taxid, o.verbose)
def create_row_for_missing_species(common_name: str,
                                   summary: pd.DataFrame,
                                   taxonomy: Taxonomy) -> Optional[Tuple[pd.Series, bool]]:
    # can also be SPUH, ISSF etc., just something that wasn't on the official list
    # The number of columns may vary based on the checklist, but we fill
    # in the ones that we know must be there
    taxonomy_row = taxonomy.find_local_name_row(common_name)
    if taxonomy_row is None:  # i.e. not found, drop it
        return None

    new_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    new_row['Group'] = taxonomy_row.SPECIES_GROUP
    new_row['CommonName'] = common_name
    new_row['TaxonOrder'] = taxonomy_row.TAXON_ORDER
    new_row['NACC_SORT_ORDER'] = taxonomy_row.NACC_SORT_ORDER
    new_row['ABA_SORT_ORDER'] = taxonomy_row.ABA_SORT_ORDER
    new_row['Category'] = taxonomy_row.Category
    # Filled in later. This is the "Grand Total", not the total from an individual checklist
    new_row['Total'] = 0

    # Not on official list, so mark it Rare if it's a species (not SPUH etc.)
    rarity = taxonomy_row.Category == 'species'
    if rarity:
        new_row['Rare'] = 'X'

    return new_row, rarity
def create_category_column(summary: pd.DataFrame, taxonomy: Taxonomy) -> list:
    categories = []
    for common_name in summary.CommonName.values:
        taxonomy_row = taxonomy.find_local_name_row(common_name)
        category = '' if taxonomy_row is None else taxonomy_row.Category
        categories.append(category)
    return categories
def filter_additional_rare(taxonomy: Taxonomy, additional_rare: List[str]) -> List[str]:
    rare_species = []
    for cn in additional_rare:
        row = taxonomy.find_local_name_row(cn)
        if row is not None and row.Category == 'species':
            rare_species.append(cn)
    return rare_species
@classmethod
def load_from_json(cls, name):
    """
    Loads a taxonomy object from the given json file and name.
    """
    name_with_extension = str(name) + ".json"
    file_name = "taxonomies/" + name_with_extension
    with open(file_name) as file_object:
        graph_json = json.load(file_object)
    graph = json_graph.node_link_graph(graph_json)
    return Taxonomy(graph)
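# A possible save-side counterpart to load_from_json, shown only as a sketch:
# it assumes the Taxonomy object exposes its networkx graph as `.graph`
# (an assumption; the attribute name is not shown in the original snippet).
import json
from networkx.readwrite import json_graph

def save_to_json(taxonomy, name):
    # serialize the underlying graph in the same node-link form that
    # load_from_json reads back with json_graph.node_link_graph()
    file_name = "taxonomies/" + str(name) + ".json"
    with open(file_name, "w") as file_object:
        json.dump(json_graph.node_link_data(taxonomy.graph), file_object)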
def clean_common_names(common_names: List[str],
                       taxonomy: Taxonomy,
                       local_translation_context: LocalTranslationContext) -> List[str]:
    # skip tertiary_transformation() for now
    common_names = [
        secondary_species_processing(pre_process_line(line))
        for line in common_names
    ]
    # text_list = [tertiary_transformation(secondary_species_processing(pre_process_line(line))) \
    #              for line in text_list]

    # # Processing 1 checklist here
    # tti = TaxonomyTokenIdentify(taxonomy, interim_data_path)
    #
    # # use text_list from above
    # text_list_lower = [x.lower() for x in text_list]
    # possibles = filter_to_possibles(tti, text_list_lower)
    # print(f'Possible species lines: {len(possibles)} (based on word intersections)')

    # Double translate
    # print('Doing double translation')  # Can take a while
    translated = []
    for line in common_names:  # was: possibles
        txline = local_translation_context.apply_translations(line.lower(), True)
        translated.append(txline)

    double_translated = []
    for line, _ in translated:
        txline2 = local_translation_context.apply_translations(line.lower(), True)
        double_translated.append(txline2)

    double_translated = [x for (x, y) in double_translated]
    # print(double_translated)

    # they may be all lower case, so return proper capitalization
    result = []
    for common_name in double_translated:
        xcn = ''
        if common_name != '':  # avoid most common exception
            try:
                row = taxonomy.find_local_name_row(common_name)
                xcn = row.comName
            except AttributeError as ae:
                print(ae)
                print(f'no taxonomy entry for "{common_name}"')
        result.append(xcn)

    return result
def read(self, parsed_taxonomies):
    """
    Read in taxonomies for a given code table.

    params:
        parsed_taxonomies (dict{id: Taxonomy})
    """
    for key, taxonomy in parsed_taxonomies.items():
        synonym_phrases = [Phrase(synonym) for synonym in taxonomy.synonyms]
        head_phrase = Phrase(taxonomy.head)
        self.taxonomies[key] = Taxonomy(key, head_phrase, synonym_phrases)
def taxonomies_to_str(self):
    """
    Convert taxonomies to strings, to be written to a file.

    returns:
        taxonomies_as_str (dict{id: Taxonomy(str)})
    """
    taxonomies_as_str = {}
    for key, taxonomy in self.taxonomies.items():
        taxonomy_head = taxonomy.head.raw_form
        synonyms = [synonym.raw_form for synonym in taxonomy.synonyms]
        taxonomies_as_str[key] = Taxonomy(key, taxonomy_head, synonyms)
    return taxonomies_as_str
def strip_off_scientific_names(text_list: List[str], taxonomy: Taxonomy) -> List[str]:
    # The CAMP-2020 checklist has <Common Name> <Scientific Name>
    # Assume all scientific names are two words, and drop them
    stripped_text_list = []
    for line in text_list:
        line = line.strip()
        # e.g. line = 'California Quail Callipepla californica'
        words = line.split(' ')
        if len(words) > 2:
            sci_name = ' '.join(words[-2:]).lower()
            row = taxonomy.find_scientific_name_row(sci_name)
            if row is not None:
                line = ' '.join(words[:-2])  # .lower()
        stripped_text_list.append(line)
    return stripped_text_list
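# Worked example for strip_off_scientific_names (hypothetical data; assumes
# the taxonomy lookup recognizes 'callipepla californica'):
#
#   strip_off_scientific_names(['California Quail Callipepla californica'], taxonomy)
#   # -> ['California Quail']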
def find_cites(contribs):
    tax = Taxonomy()
    successes = 0
    failures = 0
    citations = []
    for contrib in contribs:
        row = None
        if "Rationale" in contrib:
            row = contrib["Rationale"]
        if "Text" in contrib:
            row = contrib["Text"]
        if row is not None:
            cites = search_for_citations(tax, row)
            entry = {"ID": contrib["ID"], "Citations": cites}
            citations.append(entry)
            successes += 1
        else:
            failures += 1
    return citations
def csv_dataframe_to_checklist(checklist: pd.DataFrame,
                               taxonomy: Taxonomy,
                               local_translation_context: LocalTranslationContext,
                               observer_name: str,
                               xdates: List[str]) -> Optional[pd.DataFrame]:
    # Use column names from eBird and let them be fixed by transform_checklist_details
    # These all have to be present for transform_checklist_details
    if set(checklist.columns) & {'CommonName', 'Total'} == set():
        return None

    cleaned_common_names = clean_common_names(checklist.CommonName, taxonomy,
                                              local_translation_context)
    checklist.CommonName = cleaned_common_names
    # This will get switched back by transform_checklist_details
    checklist.rename(columns={'Total': 'howManyStr'}, inplace=True)
    xdtypes = {'CommonName': str, 'howManyStr': int}
    checklist = checklist.astype(dtype=xdtypes)

    checklist['speciesCode'] = [
        taxonomy.find_species6_ebird(cn) for cn in checklist.CommonName
    ]
    checklist['locId'] = 'L5551212'
    checklist['subId'] = 'S5551212'
    checklist['groupId'] = ''
    checklist['durationHrs'] = 0.5
    checklist['effortDistanceKm'] = 0.1
    checklist['effortDistanceEnteredUnit'] = 'mi'
    # 'obsDt' needs dates in this form: '26 Dec 2020'
    obsdt = normalize_date_for_visits(xdates[0])
    checklist['obsDt'] = f'{obsdt} 12:01'
    checklist['userDisplayName'] = observer_name
    checklist['numObservers'] = 1
    checklist['comments'] = 'Generated'

    # Clean up
    checklist = transform_checklist_details(checklist, taxonomy)

    return checklist
def set_target_taxonomy_by_string(self, taxonomy_string):
    '''Set the target_taxonomy instance variable by a string, which gets
    parsed into the requisite array form and stored in the instance variable'''
    self.target_taxonomy = Taxonomy.split_taxonomy(taxonomy_string)
NOUNSET_BANK = NounSetBank(DATA_DIR + 'nounsets.yml')
NOUN_FORMS = {
    lang: NounFormBank(DATA_DIR + 'nouns_{}.yml'.format(lang))
    for lang in LANGUAGES
}
PREPSET_BANK = PrepositionSetBank(DATA_DIR + 'prepsets.yml')
PRONSET_BANK = PronounSetBank(DATA_DIR + 'pronsets.yml')
PRONOUN_FORMS = {
    lang: PronounFormBank(DATA_DIR + 'prons_{}.yml'.format(lang))
    for lang in LANGUAGES
}
TAXONOMY = Taxonomy(DATA_DIR + 'taxonomy.yml')

#VERBSET_BANK = VerbSetBank(DATA_DIR + 'verbsets/')
VERBSET_BANK = VerbSetBank(DATA_DIR + 'verbsets.yml')
VERB_FORMS = {
    lang: VerbFormBank(DATA_DIR + 'verbs_{}.yml'.format(lang))
    for lang in LANGUAGES
}

TEMPLATE_DIR = DATA_DIR + 'templates/'
ADJP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'adjp_templates.yml')
ADVP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'advp_templates.yml')
CLAUSE_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'clause_templates.yml')
#CUSTOM_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'custom_templates.yml')
CUSTOM_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'custom_postedited.yml')
NP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'np_templates.yml')
def main(self, tree_filename, tree_format='newick', ids=None):
    col_delimiter = '\t|\t'
    row_delimiter = '\t|\n'
    url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the text dump
    for extract in ('nodes.dmp', 'names.dmp'):
        if os.path.exists(os.path.join(self.data_dir, extract)):
            print('Using existing copy of %s' % extract)
        else:
            print('Extracting %s from %s...' % (extract, filename))
            archive = tarfile.open(name=filename, mode='r:gz')
            archive.extract(extract, path=self.data_dir)
            archive.close()

    # get names for all tax_ids from names.dmp
    print('Getting names...')
    scientific_names = {}
    other_names = defaultdict(set)
    with open(os.path.join(self.data_dir, 'names.dmp')) as names_file:
        for line in names_file:
            line = line.rstrip(row_delimiter)
            values = line.split(col_delimiter)
            tax_id, name_txt, _, name_type = values[:4]
            if name_type == 'scientific name':
                scientific_names[tax_id] = name_txt
            else:
                other_names[tax_id].add(name_txt)

    # read all node info from nodes.dmp
    print('Reading taxonomy...')
    nodes = {}
    with open(os.path.join(self.data_dir, 'nodes.dmp')) as nodes_file:
        for line in nodes_file:
            line = line.rstrip(row_delimiter)
            values = line.split(col_delimiter)
            tax_id, parent_id = values[:2]

            if ids:
                this_node = BaseTree.Clade(name=tax_id)
            else:
                this_node = BaseTree.Clade(name=scientific_names[tax_id])

            nodes[tax_id] = this_node
            this_node.parent_id = parent_id

            if tree_format == 'cdao':
                # add common names, synonyms, misspellings, etc. as skos:altLabels
                if not hasattr(this_node, 'tu_attributes'):
                    this_node.tu_attributes = []
                for x in other_names[tax_id]:
                    this_node.tu_attributes.append(
                        ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                         Taxonomy.format_rdf_string(x)))

    print('Found %s OTUs.' % len(nodes))

    # create tree from nodes dictionary
    print('Building tree...')
    for node_id, this_node in nodes.items():
        if node_id == this_node.parent_id:
            root_node = this_node
            print('Found root.')
        else:
            parent_node = nodes[this_node.parent_id]
            parent_node.clades.append(this_node)
            del this_node.parent_id
    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (tree_format, tree_filename))
    bp.write([tree], tree_filename, tree_format)
    print('Done!')
def main():
    logger.warning("Start building taxonomy")

    # Load input: this includes reading the network, the text, and
    # a background corpus for contrastive analysis
    logger.info("Loading graph from file")
    A, node_info = utils.load_graph(args.data_dir,
                                    remove_citation=True,
                                    force_undirected=True)
    logger.info("Create HIN")
    G = HIN(A, node_info)
    logger.info("Load text")
    corpus = utils.load_documents(args.data_dir)
    motif_matchers = [
        Motif_KPV(), Motif_KPA(), Motif_KP(), Motif_KPVY(), Motif_KPAA()
    ]

    intermediate_dir = plib.Path(args.data_dir, "intermediate")
    if not intermediate_dir.is_dir():
        logger.warning(f"Creating intermediate dir {intermediate_dir}")
        intermediate_dir.mkdir(parents=False)

    # we collect all phrases
    T = []  # terms / phrases
    for info in node_info.values():
        if info.node_type == "K":
            T.append(info.entity_id)
    D = corpus

    tf_bg, idf_bg = utils.get_tf_idf_from_file(
        plib.Path(args.data_dir, "background_documents.txt"), T)

    taxo = Taxonomy(D, T, G)
    builder = NetTaxo(motif_matchers,
                      tf_lift=args.tf_lift,
                      idf_lift=args.idf_lift,
                      damping=args.damping,
                      conf_motif=Motif_KPA().motif_name)

    # set background corpus for contrastive analysis
    builder.set_background(tf_bg, idf_bg)
    builder.build(taxo, args.levels)

    # save
    output_dir = plib.Path(args.output_dir, config.unique_id)
    if not output_dir.is_dir():
        output_dir.mkdir(parents=True)
    logger.info(f"Saving to {output_dir}")
    taxo.save(output_dir)
    logger.info("Saving complete")

    # generate output
    taxo.visualize(plib.Path(output_dir, "vis.pdf"))
    taxo.save_readable(output_dir)
def merge_checklists(summary_base: Any,
                     sector_files: List[Any],
                     stem_to_colname: Union[dict, List[str]],
                     taxonomy: Taxonomy,
                     local_translation_context: LocalTranslationContext
                     ) -> Tuple[pd.DataFrame, List[str], List[str]]:
    # Easier to use single column summary_base, but this will transform it if needed
    if isinstance(summary_base, Path):
        template = read_excel_or_csv_path(summary_base)
        # Create a single column master for summary
        summary_base = recombine_transformed_checklist(template, taxonomy)
    elif isinstance(summary_base, pd.DataFrame):
        summary_base = summary_base

    base_has_adult_col = 'Ad' in summary_base.columns
    base_has_immature_col = 'Im' in summary_base.columns
    has_adult_col = False
    has_immature_col = False

    # Start of big processing loop
    summary = summary_base.copy()
    sector_unique = 1
    sector_cols = []
    for idx, fpath in enumerate(sector_files):
        try:
            if isinstance(fpath, Path):
                sector_col = stem_to_colname.get(fpath.stem, None)
            else:
                sector_col = stem_to_colname[idx]
        except Exception as ee:
            print(ee, idx, fpath)
            sector_col = None
        if not sector_col:
            sector_col = f'X{sector_unique}'
            sector_unique += 1

        sector_cols.append(sector_col)
        print(f'Processing {sector_col}')

        summary_common_names = summary.CommonName.values
        summary_common_names_lower = [xs.lower() for xs in summary_common_names]
        if isinstance(fpath, Path):
            checklist = read_excel_or_csv_path(fpath)
            # Only Excel files would be double column. CSV files could be hand made,
            # so clean them up. Double translation takes a long time, so avoid when
            # possible
            if fpath.suffix == '.xlsx':
                checklist = recombine_transformed_checklist(checklist, taxonomy)
            else:
                cleaned_common_names = clean_common_names(
                    checklist.CommonName, taxonomy, local_translation_context)
                checklist.CommonName = cleaned_common_names
                # print(checklist.Total)
                xdtypes = {'CommonName': str, 'Total': int}
                checklist = checklist.astype(dtype=xdtypes)
            # so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
            # summary.NACC_SORT_ORDER = pd.Series(so).fillna(taxonomy.INVALID_NACC_SORT_ORDER)
        else:  # isinstance(summary_base, pd.DataFrame):
            checklist = fpath

        # Drop any rows with a blank CommonName. This can occur if the checklist
        # is a summary report with a 'Total' row at the bottom, and 'Total' is
        # not a valid species
        checklist = checklist[checklist.CommonName != '']

        # Sector checklists may have added species not on the template
        checklist['cnlower'] = [xs.lower() for xs in checklist.CommonName]
        checklist_common_names_lower = set([xs.lower() for xs in checklist.CommonName])
        names_to_add = checklist_common_names_lower - set(summary_common_names_lower)
        if not names_to_add == set():
            species_to_add = taxonomy.filter_species(list(names_to_add))
            if len(species_to_add) > 0:
                print(f'Added species: {species_to_add}')
            # Fix capitalization
            names_to_add = clean_common_names(list(names_to_add), taxonomy,
                                              local_translation_context)
            blank_row = pd.Series([''] * len(summary.columns), index=summary.columns)
            rows_to_add = []
            for cn in names_to_add:
                row = blank_row.copy()
                row['CommonName'] = cn
                if cn.lower() in species_to_add:
                    row['Rare'] = 'X'
                total = checklist[checklist.cnlower == cn.lower()]['Total'].values[0]
                row[sector_col] = total
                rows_to_add.append(row)

            summary = summary.append(rows_to_add, ignore_index=True)

        # has_adult_col = 'Ad' in checklist.columns
        has_immature_col = 'Im' in checklist.columns

        summary[sector_col] = 0  # 'Total' field for this sector
        if has_adult_col:
            ad_col = f'Ad-{sector_col}'
            summary[ad_col] = 0
        if has_immature_col:
            im_col = f'Im-{sector_col}'
            summary[im_col] = 0

        # Fill in total for existing names
        # already_present_names = set(summary_common_names) & set(checklist.CommonName)
        # for cn in set(checklist.CommonName):
        #     total = checklist[checklist.CommonName == cn]['Total'].values[0]
        #     summary.loc[summary.CommonName == cn, sector_col] = total
        # print(summary.shape, len(summary_common_names_lower))
        summary_common_names_lower = [xs.lower() for xs in summary.CommonName]
        summary['cnlower'] = summary_common_names_lower
        for ix, row in checklist.iterrows():
            # if row.Total:
            #     print(row)
            total = row.FrozenTotal if 'FrozenTotal' in checklist.columns else row.Total
            mask = summary.cnlower == row.cnlower
            summary.loc[mask, sector_col] = total
        summary.drop(['cnlower'], axis=1, inplace=True)

        # if has_adult_col:
        #     adult_total = checklist[checklist.CommonName == cn]['Ad'].values[0]
        #     summary.loc[summary.CommonName == cn, ad_col] = adult_total
        #
        # if has_immature_col:
        #     immature_total = checklist[checklist.CommonName == cn]['Im'].values[0]
        #     summary.loc[summary.CommonName == cn, im_col] = immature_total

    # Fill in zeros for missing sector_col values; may have blanks if species added
    # for col in sector_cols:
    #     summary[col] = summary[col].apply(pd.to_numeric).fillna(0)

    # Do sums for Ad/Im columns. Ad == 'Adult/White'
    if base_has_adult_col:
        ad_cols = [xs for xs in summary.columns if xs.startswith('Ad-')]
        summary['Ad'] = summary[ad_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)
    if base_has_immature_col:
        im_cols = [xs for xs in summary.columns if xs.startswith('Im-')]
        summary['Im'] = summary[im_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    # Look up Group and TaxonOrder for anything missing these (may have been added species)
    for idx, row in summary.iterrows():
        record = taxonomy.find_local_name_row(row['CommonName'])
        if record is not None:
            summary.at[idx, 'TaxonOrder'] = record.TAXON_ORDER
            summary.at[idx, 'Group'] = record.SPECIES_GROUP
            so = record.NACC_SORT_ORDER if record.NACC_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'NACC_SORT_ORDER'] = so
            so = record.ABA_SORT_ORDER if record.ABA_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'ABA_SORT_ORDER'] = so
            summary.at[idx, 'Category'] = record.Category

    # Re-sort by TaxonOrder
    # Must sort before creating formulae for Total
    so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
    summary.NACC_SORT_ORDER = pd.Series(so).fillna(taxonomy.INVALID_NACC_SORT_ORDER)
    so = pd.to_numeric(summary.ABA_SORT_ORDER, errors='coerce')
    summary.ABA_SORT_ORDER = pd.Series(so).fillna(taxonomy.INVALID_NACC_SORT_ORDER)
    try:
        summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(drop=True)
    except TypeError as te:
        print(te)
        traceback.print_exc(file=sys.stdout)
        return summary

    # Now set the overall total field:
    # sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    # summary['Total'] = summary[sector_cols].apply(pd.to_numeric).fillna(0).sum(axis=1).astype(int)
    col_letters = excel_columns()
    # team_start_col = col_letters[len(base_columns)]
    std_columns = [
        'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    # Filter out any missing columns
    std_columns = [col for col in std_columns if col in summary.columns]
    # team_start_col = col_letters[index_of_first_subtotal_column(summary)]
    sector_start_col = col_letters[len(std_columns)]
    sector_end_col = col_letters[len(summary.columns) - 1]
    total_formula = [
        f'=SUM(${sector_start_col}{ix}:${sector_end_col}{ix})'
        for ix in range(2, summary.shape[0] + 2)
    ]
    summary['Total'] = total_formula

    # Add last row for Total and each Sector total
    totals_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    totals_row['Group'] = 'Totals'
    totals_row['TaxonOrder'] = 99999
    totals_row['NACC_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER
    totals_row['ABA_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER

    # Formula for Grand Total, e.g. =SUM($D$2:$D$245)
    total_col_letter = col_letters[std_columns.index('Total')]
    total_formula = f'=SUM(${total_col_letter}2:${total_col_letter}{summary.shape[0] + 1})'
    totals_row.Total = total_formula

    # sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    sector_totals = summary[sector_cols].apply(
        pd.to_numeric).fillna(0).sum(axis=0).astype(int)
    for col, st in sector_totals.items():
        totals_row[col] = st

    summary = summary.append(totals_row, ignore_index=True)

    cols_to_drop = [
        col for col in summary.columns
        if (col.startswith('Ad-') or col.startswith('Im-'))
    ]
    summary.drop(labels=cols_to_drop, axis=1, inplace=True)
    summary.rename(columns={'Ad': 'Adult/White', 'Im': 'Immature/Blue'},
                   inplace=True)

    # Re-order columns
    # print(sector_cols)
    # print(summary.columns)
    new_col_order = [
        col for col in [
            'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
            'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
        ] if col in summary.columns
    ]
    new_col_order.extend(sector_cols)
    summary = summary[new_col_order]

    # Don't hide 'Rare' since this will be frequently used in a filter
    cols_to_hide = ['D', 'Difficulty', 'Adult', 'Immature', 'W-morph', 'B-Morph']

    if 'Adult/White' in summary.columns:
        if summary['Adult/White'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Adult/White')
    if 'Immature/Blue' in summary.columns:
        if summary['Immature/Blue'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Immature/Blue')

    cols_to_highlight = list(
        set(summary.columns) & {'Total', 'Adult/White', 'Immature/Blue'})

    return summary, cols_to_hide, cols_to_highlight
def expand_node(self, taxo: Taxonomy, node: TaxoNode, level: int,
                n_children: int, max_level: int):
    """Expand a taxonomy node.

    Args:
        taxo: The Taxonomy object.
        node: The node to expand.
        level: Current level of taxonomy.
        n_children: Number of children to expand to next level.
        max_level: Maximum depth to expand to.
    """
    logger.info(f"Expand node {node.prefix}")

    logger.info("1. Contrast analysis for term scoring")
    if node.parent is None:
        raise RuntimeError("Contrastive analysis assumes parent node is set.")
    else:
        term_scores = contrast.node_contrast_analysis(
            node,
            node.parent,
            taxo.siblings(node),
            self.tf_lift * (config.LEVEL_DECAY**level),
            self.idf_lift * (config.LEVEL_DECAY**level),
        )
        for term, old_score in term_scores.items():
            term_scores[term] = old_score * node.term_prior[term]
    node.term_scores = term_scores

    logger.debug("Top terms for this node")
    logger.debug(
        str([
            utils.strip_phrase_tags(phrase)
            for phrase in utils.take_topk(term_scores, 20)
        ]))

    logger.info(
        f"check stopping criteria, level {level} >= {max_level} is {level >= max_level}"
    )
    if level >= max_level:
        return

    logger.info("Generate motif context")
    generate_motif_context(args.data_dir, level, taxo.G, node.terms,
                           node.docs, self.motif_matchers)
    sample_motif_context(args, level, self.conf_motif)

    logger.info("2. Local embedding")
    word_embed, net_embed = loc_emb.local_embedding(node, args.data_dir)
    wv_word = word_embed.syn0
    wv_net = net_embed.syn0

    logger.info("3. Term clustering")
    logger.debug(f"#term_scores {len(term_scores)}")
    topk = min(config.N_TOP_TERMS, int(config.TOP_TERMS_PCT * len(term_scores)))
    topk_terms = utils.take_topk(term_scores, topk)
    clus_labels, aligned_terms = clus.term_clustering(topk_terms, wv_word,
                                                      n_clusters=n_children)
    clus_labels_net, aligned_terms_net = clus.term_clustering(
        topk_terms, wv_net, n_clusters=n_children)
    map_ab = clus.align_clustering(clus_labels, clus_labels_net)

    logger.info("4. Anchor phrase selection")
    # anchor phrase selection w/ intersection
    # WT_clusters = []  # term weights
    term_weights_clusters = []
    for i in range(n_children):
        # get all terms in the cluster
        clus_terms_word = set(
            clus.get_cluster_terms(clus_labels, aligned_terms, i))
        clus_terms_net = set(
            clus.get_cluster_terms(clus_labels_net, aligned_terms_net, map_ab[i]))
        clus_terms = clus_terms_word & clus_terms_net
        term_nids = _term2nid(taxo.G, clus_terms)
        # get associated documents
        D_c, weights_c = clus.get_cluster_documents(taxo.G, node.docs, term_nids)
        # run contrastive analysis
        tf_c, idf_c = utils.get_tf_idf(D_c, clus_terms, weights=weights_c)
        next_level = level + 1
        term_scores_c = contrast.contrast_analysis(
            tf_c, idf_c, node.tf, node.idf,
            self.tf_lift * (config.LEVEL_DECAY**next_level),
            self.idf_lift * (config.LEVEL_DECAY**next_level))
        term_weights_clusters.append(term_scores_c)
        logger.debug("Cluster {}:: ".format(i) + str([
            utils.strip_phrase_tags(phrase)
            for phrase in utils.take_topk(term_scores_c, 30)
        ]))

    logger.info("5. Motif selection")
    cluster_seeds = []
    n_seed = config.N_ANCHOR_TERMS
    for i in range(n_children):
        seed_phrases = utils.take_topk(term_weights_clusters[i], n_seed)
        cluster_seeds.append(seed_phrases)
    motif_selection_sampling(args, level, cluster_seeds,
                             keep_ratio=config.TOP_MOTIF_PCT)

    logger.info("6. Recompute embedding")
    joint_embed = loc_emb.joint_local_embedding(node, args.data_dir)
    wv_all = joint_embed.syn0

    logger.info("7. Soft clustering")
    clus_labels, aligned_terms, vmf = clus.soft_clustering(topk_terms, wv_all,
                                                           n_clusters=n_children)

    logger.info("8. Generate next level")
    term_prior_clusters = []
    cluster_centers = []
    for i in range(n_children):
        # get all terms in the cluster
        clus_terms = clus.get_cluster_terms(clus_labels, aligned_terms, i)
        term_nids = _term2nid(taxo.G, clus_terms)
        # get associated documents
        D_c, weights_c = clus.get_cluster_documents(taxo.G, node.docs, term_nids)
        # run contrastive analysis
        tf_c, idf_c = utils.get_tf_idf(D_c, clus_terms, weights=weights_c)
        term_scores_c = contrast.contrast_analysis(tf_c, idf_c, node.tf, node.idf)
        term_prior_clusters.append(term_scores_c)
        logger.debug("Cluster {}:: ".format(i) + str([
            utils.strip_phrase_tags(phrase)
            for phrase in utils.take_topk(term_scores_c, 30)
        ]))

    # generate next level terms and documents
    # compute clustering probability
    X = []
    X_terms = []
    for term in node.terms:
        try:
            X.append(wv_all[term])
            X_terms.append(term)
        except Exception:
            # term missing from the embedding vocabulary
            pass
    X = np.vstack(X)
    clustering_probs = clus.get_soft_cluster_probs(X, vmf.cluster_centers_,
                                                   vmf.weights_,
                                                   vmf.concentrations_)
    clustering_probs = clustering_probs.T
    for idx_c in range(n_children):
        # find words in each cluster
        terms_c = []
        term_prior_c = dict()
        for i in range(X.shape[0]):
            if clustering_probs[i, idx_c] > 2 * (1 / n_children):
                terms_c.append(X_terms[i])
                term_prior_c[X_terms[i]] = clustering_probs[i, idx_c]

        # find documents associated with each cluster
        ranking, clustering_probs_net = clus.populate_clustering(
            taxo.G, n_children, term_prior_clusters, damping=0.8)
        doc_prior_c = dict()
        docs_c = dict()
        for paper_id, paper_content in node.docs.items():
            nid = taxo.G.find_by_entity_id("P", paper_id)
            score = clustering_probs_net[nid, idx_c]
            if score <= 2 * (1 / n_children):
                continue
            docs_c[paper_id] = paper_content
            doc_prior_c[paper_id] = score

        node_c = TaxoNode(node.prefix + "/{}".format(idx_c), docs_c, terms_c,
                          doc_prior_c, term_prior_c)
        curr = node_c
        node_c.set_parent(node)
        node.add_child(node_c)
def taxonomy_array(self):
    return Taxonomy.split_taxonomy(self.taxonomy)
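# Illustration only: split_taxonomy presumably splits a lineage string into
# its ranked components, e.g. something like
#   Taxonomy.split_taxonomy('Bacteria;Proteobacteria;Gammaproteobacteria')
#   # -> ['Bacteria', 'Proteobacteria', 'Gammaproteobacteria']
# (the actual delimiter handling lives in Taxonomy.split_taxonomy, not shown here)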
def main(self, tree_filename, tree_format='newick'):
    col_delimiter = '\t'
    url = 'http://ecat-dev.gbif.org/repository/export/checklist1.zip'

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the tables
    extract = 'taxon.txt'
    if os.path.exists(os.path.join(self.data_dir, extract)):
        print('Using existing copy of %s' % extract)
    else:
        print('Extracting %s from %s...' % (extract, filename))
        archive = zipfile.ZipFile(filename, mode='r')
        archive.extract(extract, path=self.data_dir)
        archive.close()

    # build BioPython clades
    print('Reading taxonomy...')
    nodes = {}
    with open(os.path.join(self.data_dir, 'taxon.txt')) as taxonomy_file:
        for line in taxonomy_file:
            line = line.strip()
            values = line.split(col_delimiter)
            id, parent_id, syn_id, _, name, _, status = values[:7]

            # skip incertae sedis taxa
            if id == '0':
                continue

            if syn_id and 'synonym' not in status:
                continue
            elif syn_id and 'synonym' in status:
                if tree_format == 'cdao':
                    nodes[id] = ('synonym', name, syn_id)
            elif not syn_id:
                nodes[id] = BaseTree.Clade(name=name)
                nodes[id].parent_id = parent_id

    print('Found %s OTUs.' % len(nodes))
    nodes[''] = root_node = BaseTree.Clade()

    # create tree from nodes dictionary
    print('Building tree...')
    for node_id, this_node in nodes.items():
        if not node_id:
            continue
        if isinstance(this_node, BaseTree.Clade):
            try:
                parent_node = nodes[this_node.parent_id]
                parent_node.clades.append(this_node)
                del this_node.parent_id
            except (KeyError, AttributeError):
                pass
        elif this_node[0] == 'synonym':
            _, name, syn_id = this_node
            try:
                accepted_node = nodes[syn_id]
            except KeyError:
                continue
            if not isinstance(accepted_node, BaseTree.Clade):
                continue
            if not hasattr(accepted_node, 'tu_attributes'):
                nodes[syn_id].tu_attributes = []
            nodes[syn_id].tu_attributes.append(
                ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                 Taxonomy.format_rdf_string(name)))
            # print('Synonym: %s -> %s' % (name, nodes[syn_id].name))

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (tree_format, tree_filename))
    bp.write([tree], tree_filename, tree_format)
    print('Done!')
def main(self, tree_filename, tree_format='newick'):
    col_delimiter = '|'
    url = 'http://www.itis.gov/downloads/itisMySQLTables.tar.gz'

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the tables
    for extract in ('taxonomic_units', 'longnames', 'synonym_links', 'vernaculars'):
        if os.path.exists(os.path.join(self.data_dir, extract)):
            print('Using existing copy of %s' % extract)
        else:
            print('Extracting %s from %s...' % (extract, filename))
            archive = tarfile.open(name=filename, mode='r:gz')
            full_extract = [x for x in archive.getnames()
                            if x.split('/')[-1] == extract][0]
            member = archive.getmember(full_extract)
            member.name = extract
            archive.extract(extract, path=self.data_dir)
            archive.close()

    # get names for all ITIS TSNs from longnames table
    print('Getting names...')
    names = {}
    with open(os.path.join(self.data_dir, 'longnames')) as names_file:
        for line in names_file:
            line = line.strip()
            values = line.split(col_delimiter)
            tax_id, name = values
            names[tax_id] = name

    # read all node info from taxonomic_units
    print('Reading taxonomy...')
    nodes = {}
    with open(os.path.join(self.data_dir, 'taxonomic_units')) as nodes_file:
        for line in nodes_file:
            line = line.strip()
            values = line.split(col_delimiter)
            (tax_id, usage, parent_id, uncertain_parent) = [
                values[n] for n in (0, 10, 17, 23)]
            # if uncertain_parent: continue
            if usage not in ('accepted', 'valid'):
                continue

            name = names[tax_id]
            this_node = BaseTree.Clade(name=name)
            nodes[tax_id] = this_node
            this_node.parent_id = parent_id

    other_names = defaultdict(set)
    if tree_format == 'cdao':
        # get synonym definitions
        print('Getting synonyms...')
        with open(os.path.join(self.data_dir, 'synonym_links')) as synonym_file:
            for line in synonym_file:
                line = line.strip()
                values = line.split(col_delimiter)
                node_id, syn_id, _ = values
                nodes[node_id] = ('synonym', names[node_id], syn_id)
        with open(os.path.join(self.data_dir, 'vernaculars')) as synonym_file:
            for line in synonym_file:
                line = line.strip()
                values = line.split(col_delimiter)
                tax_id, name = values[:2]
                other_names[tax_id].add(name)

    print('Found %s OTUs.' % len(nodes))
    nodes['0'] = root_node = BaseTree.Clade()

    # create tree from nodes dictionary
    print('Building tree...')
    for node_id, this_node in nodes.items():
        if node_id == '0':
            continue
        if isinstance(this_node, BaseTree.Clade):
            try:
                parent_node = nodes[this_node.parent_id]
                parent_node.clades.append(this_node)
            except (KeyError, AttributeError):
                continue
            del this_node.parent_id
            if not hasattr(this_node, 'tu_attributes'):
                this_node.tu_attributes = []
            for name in other_names[node_id]:
                this_node.tu_attributes.append(
                    ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                     Taxonomy.format_rdf_string(name)))
        elif this_node[0] == 'synonym':
            _, name, syn_id = this_node
            try:
                accepted_node = nodes[syn_id]
            except KeyError:
                continue
            if not isinstance(accepted_node, BaseTree.Clade):
                continue
            if not hasattr(accepted_node, 'tu_attributes'):
                nodes[syn_id].tu_attributes = []
            nodes[syn_id].tu_attributes.append(
                ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                 Taxonomy.format_rdf_string(name)))
            # print('Synonym: %s -> %s' % (name, nodes[syn_id].name))

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (tree_format, tree_filename))
    bp.write([tree], tree_filename, tree_format)
    print('Done!')
# Instantiate Page and Taxonomy objects from the filtered results
page = tk.result_list_page
tax = tk.result_list_tax

# target object lists
aim_page = []
aim_tax = []
for item in page:
    # instantiate a page
    aim_page.append(Page(item=item))
for item in tax:
    # instantiate a taxonomy entry
    aim_tax.append(Taxonomy(tax_name=item['type'], item=item))

# ----------------------------------- XiaoAi ------------------------------------------
xiaoai = XiaoAi()
# add the tasks to XiaoAi's queue
for item in aim_page:
    xiaoai.add_task(item)
for item in aim_tax:
    xiaoai.add_task(item)
# start the automated generation
def assign_taxonomy(self, key, output_dir, dna_region, names_file, ref_taxa):
    from taxonomy import Taxonomy, consensus

    # results = uc_results
    results = {}
    try:
        self.runobj.run_status_file_h.write(
            json.dumps({'status': "STARTING_ASSIGN_TAXONOMY: " + key}) + "\n")
    except Exception:
        pass
    # test_read = 'FI1U8LC02GEF7N'

    # open gast_file to get results "to Dirs"
    tagtax_terse_filename = os.path.join(output_dir, "tagtax_terse")
    tagtax_long_filename = os.path.join(output_dir, "tagtax_long")
    tagtax_terse_fh = open(tagtax_terse_filename, 'w')
    tagtax_long_fh = open(tagtax_long_filename, 'w')
    tagtax_long_fh.write("\t".join([
        "read_id", "taxonomy", "distance", "rank", "refssu_count", "vote",
        "minrank", "taxa_counts", "max_pcts", "na_pcts", "refhvr_ids"
    ]) + "\n")
    gast_file = os.path.join(output_dir, "gast" + dna_region)
    if not os.path.exists(gast_file):
        logging.info("gast:assign_taxonomy: Could not find gast file: " +
                     gast_file + ". Returning")
        return results

    for line in open(gast_file, 'r'):
        # must split on tab because the last field may be empty and must be maintained as blank
        data = line.strip().split("\t")
        if len(data) == 3:
            data.append("")  # 0=id, 1=ref, 2=dist, 3=align, 4=frequency
        read_id = data[0]
        if read_id in results:
            results[read_id].append(
                [data[1].split('|')[0], data[2], data[3], data[4]])
        else:
            results[read_id] = [[data[1].split('|')[0], data[2], data[3], data[4]]]

    for line in open(names_file, 'r'):
        data = line.strip().split("\t")
        dupes = data[1].split(",")
        read_id = data[0]
        taxObjects = []
        distance = 0
        frequency = 0
        refs_for = {}

        # assign taxonomy, either fake or real
        if read_id not in results:
            results[read_id] = [
                "Unknown", '1', "NA", '0', '0', "NA", "0;0;0;0;0;0;0;0",
                "0;0;0;0;0;0;0;0", "100;100;100;100;100;100;100;100"
            ]
            refs_for[read_id] = ["NA"]
        else:
            # it is in results[]
            for i in range(0, len(results[read_id])):
                ref = results[read_id][i][0]
                if ref in ref_taxa:
                    for tax in ref_taxa[ref]:
                        for t in tax:
                            taxObjects.append(Taxonomy(t))
                if read_id in refs_for:
                    if results[read_id][i][0] not in refs_for[read_id]:
                        refs_for[read_id].append(results[read_id][i][0])
                else:
                    refs_for[read_id] = [results[read_id][i][0]]

                # should all be the same distance for the duplicates
                distance = results[read_id][i][1]
                frequency = results[read_id][i][3]

            # Look up the consensus taxonomy for the array
            taxReturn = consensus(taxObjects, C.majority)
            # 0=taxObj, 1=winning vote, 2=minrank, 3=rankCounts, 4=maxPcts, 5=naPcts
            taxon = taxReturn[0].taxstring()
            # if taxon[-3:] == ';NA':
            #     taxon = taxon[:-3]
            rank = taxReturn[0].depth()
            if not taxon:
                taxon = "Unknown"

            # (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts)
            results[read_id] = [
                taxon, str(distance), rank, str(len(taxObjects)),
                str(taxReturn[1]), taxReturn[2], taxReturn[3], taxReturn[4],
                taxReturn[5]
            ]

        # Example output rows:
        # read_id  taxonomy  distance  rank  refssu_count  vote  minrank  taxa_counts  max_pcts  na_pcts  refhvr_ids
        # D4ZHLFP1:25:B022DACXX:3:1101:12919:40734 1:N:0:TGACCA|frequency:162  Bacteria;Proteobacteria;Gammaproteobacteria  0.117  class  2  100  genus  1;1;1;2;2;2;0;0  100;100;100;50;50;50;0;0  0;0;0;0;0;0;100;100  v6_CI671
        # D4ZHLFP1:25:B022DACXX:3:1101:10432:76870 1:N:0:TGACCA|frequency:105  Bacteria;Proteobacteria;Gammaproteobacteria  0.017  class  1  100  class  1;1;1;0;0;0;0;0  100;100;100;0;0;0;0;0  0;0;0;100;100;100;100;100  v6_BW306

        # Replace hash with final taxonomy results, for each copy of the sequence
        for d in dupes:
            d = d.strip()
            tagtax_long_fh.write(d + "\t" + "\t".join(results[read_id]) + "\t" +
                                 ', '.join(sorted(refs_for[read_id])) + "\n")
            tagtax_terse_fh.write(d + "\t" + results[read_id][0] + "\t" +
                                  results[read_id][2] + "\t" +
                                  results[read_id][3] + "\t" +
                                  ', '.join(sorted(refs_for[read_id])) + "\t" +
                                  results[read_id][1] + "\t" +
                                  str(frequency) + "\n")

    tagtax_terse_fh.close()
    tagtax_long_fh.close()
    return results
def build_full_tally_sheet(double_translated, fpath: Path, taxonomy: Taxonomy,
                           parameters: Parameters, circle_prefix: str):
    candidate_names = [x for x, y in double_translated]
    local_names = process_exceptions(candidate_names, fpath, circle_prefix)

    # if an ISSF etc. is in the list, then its base species must be also
    issfs = taxonomy.filter_issf(local_names)
    for cn in issfs:
        base_species = taxonomy.report_as(cn)
        if base_species:
            local_names.append(base_species)

    entries = []
    for local_name in local_names:
        # common_name, taxon_order, species_group, NACC_SORT_ORDER
        record = taxonomy.find_local_name_row(local_name)
        if record is not None:
            # e.g. ('White-throated Sparrow', 31943, 'New World Sparrows', 1848.0)
            entry = (record.comName, record.TAXON_ORDER, record.SPECIES_GROUP,
                     record.NACC_SORT_ORDER, record.ABA_SORT_ORDER, '', 0
                     )  # append 'Rare', 'Total'
            entries.append(entry)

    df = pd.DataFrame(entries,
                      columns=[
                          'CommonName', 'TaxonOrder', 'Group',
                          'NACC_SORT_ORDER', 'ABA_SORT_ORDER', 'Rare', 'Total'
                      ])

    # Re-order
    cols = [
        'Group', 'CommonName', 'Rare', 'Total', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    local_checklist = df[cols]
    local_checklist.sort_values(by='TaxonOrder', inplace=True)
    # local_checklist.shape

    # double_translated may have duplicates
    local_checklist = local_checklist[
        ~local_checklist.duplicated(subset=['CommonName'], keep='first')]

    local_checklist = process_annotations_or_rarities(local_checklist, fpath,
                                                      circle_prefix)

    # Re-order columns
    preferred_order = [
        'Group', 'CommonName', 'Rare', 'D', 'Total', 'Ad', 'Im', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER', 'Difficulty', 'Adult',
        'Immature', 'W-morph', 'B-Morph', 'CountSpecial'
    ]
    newcols = [col for col in preferred_order if col in local_checklist.columns]
    local_checklist = local_checklist[newcols]

    # Write out full tally sheet
    # circle_code = circle_prefix[0:4]
    # double_path = outputs_path / f'{circle_code}-DoubleX.xlsx'
    # write_local_checklist_with_group(local_checklist, double_path, parameters.parameters)

    return local_checklist
#
# TODO: list-types is missing
#

INFO = """
This is the CLI for the Orange Button Core library.  Information is available at the following URLs:

Orange Button Overview: https://sunspec.org/orange-button-initiative/
Orange Button GitHub: https://github.com/SunSpecOrangeButton
Orange Button CLI GitHub: https://github.com/SunSpecOrangeButton/core
"""

DASHES = "---------------------------------------------------------------------------------------"

taxonomy = Taxonomy()
csv = False
json = False
xml = False


def info(args):
    print(INFO)


def convert(args):
    p = Parser(taxonomy)
    ff = None
    if json:
""" Script used to generate ncbi_subset_tax.json NCBI taxonomy downloaded in 2019 July 1st. """ from taxonomy import Taxonomy tax = Taxonomy.from_ncbi("nodes.dmp", "names.dmp") new_tax = tax.prune(remove=['28384', '12908', '10239', '2', '2157'] + [t for t in tax.children("2759") if t != "543769"]) with open("new_tax.json", "wb") as f: f.write(new_tax.to_json(True))
def hits2taxa(input, out, db, verbose, limit=0):
    """Process fastq from input file.

    You may play with bufsize, so the process runs without waiting.
    """
    # init taxonomy
    taxa = Taxonomy(db)

    # handle gzipped/bzip2 stream
    if input.name.endswith('.gz'):
        input = gzip.open(input.name)
    elif input.name.endswith('.bz2'):
        import bz2
        input = bz2.BZ2File(input.name)

    # get match generator
    if input == sys.stdin:
        line0 = input.readline()
        if line0.startswith('@'):
            mGenerator = get_matches_sam(input, verbose)
        else:
            mGenerator = get_matches_blast8(input, verbose)
    # get sam stream
    elif input.name.endswith(('.sam', '.sam.gz')):
        mGenerator = get_matches_sam(input, verbose)
    else:
        mGenerator = get_matches_blast8(input, verbose)

    # process reads in 1K batches
    if verbose:
        sys.stderr.write("[%s] Processing reads from %s ...\n" %
                         (datetime.ctime(datetime.now()), input.name))

    # get taxa and genes
    taxid2reads = {}
    taxid2matches = {}
    k = 0
    for i, (rname, hits) in enumerate(mGenerator, 1):
        if limit and i > limit:
            break
        if not rname:
            continue
        # print info
        if verbose and i % 1e4 == 1:
            sys.stderr.write(" %s parsed. %.2f%s with taxa \r" %
                             (i, k * 100.0 / i, '%'))
        # get taxa
        taxid, matches = get_taxa(hits, taxa, verbose)
        if not taxid:
            continue
        k += 1
        if taxid not in taxid2reads:
            taxid2reads[taxid] = 0
        # store read name & genes
        taxid2reads[taxid] += 1

    # report
    if not taxid2reads:
        sys.exit("No matches found!")

    # foreign reads
    freads = sum(reads for taxid, reads in taxid2reads.items())
    header = "#name\ttaxid\treads\t%\n"
    out.write(header)
    out.write("%s\t%s\t%s\t%.2f\n" %
              ("unknown", "-", i - freads, 100.0 * (i - freads) / i))
    for taxid, reads in sorted(taxid2reads.items(),
                               key=lambda x: x[1], reverse=True)[:10]:
        out.write("%s\t%s\t%s\t%.2f\n" %
                  (taxa[taxid][1], taxid, reads, 100.0 * reads / i))

    # print summary
    sys.stderr.write("[hits2taxa] %s entries processed!\n" % (i,))
def _create_tax(self):
    # https://en.wikipedia.org/wiki/Newick_format#Examples
    return Taxonomy.from_newick("(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;")
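# Usage sketch for the fixture above (parent()/children() as in the
# `taxonomy` package used elsewhere in this corpus; exact return types vary):
#
#   tax = Taxonomy.from_newick("(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;")
#   tax.parent("C")    # node "E"
#   tax.children("F")  # children "A", "B", "E"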