def main():
    import argparse
    parser = argparse.ArgumentParser(description=desc, epilog=epilog,
                                     formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--version', action='version', version='1.0b')
    parser.add_argument("-v", "--verbose", default=False, action="store_true",
                        help="verbose")
    parser.add_argument('-t', '--taxids', nargs="+", type=int,
                        help="group taxid(s)    [%(default)s]")
    parser.add_argument("--taxadb", default="/users/tg/lpryszcz/cluster/rapsi/taxonomy.db3",
                        help="taxonomy path  [%(default)s]")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))
        
    # init taxonomy
    taxa = Taxonomy(o.taxadb)

    # init metaphors connection
    cur = _getConnection()
    cur.execute("select taxid, name from species")
    species = {}
    for taxid, name in cur.fetchall():
        species[taxid] = (taxid, name)

    if o.verbose:
        sys.stderr.write("%s species in database\n" % len(species))

    # process taxa groups
    for taxid in o.taxids:
        # fetch proteins from the given taxa
        taxid2proteomes(cur, species, taxa, taxid, o.verbose)
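
This snippet comes from a larger script: desc, epilog, sys, Taxonomy, _getConnection and taxid2proteomes are defined at module level. A minimal sketch of that scaffolding (the import path and texts are assumptions, not taken from the original):

# hypothetical module-level scaffolding assumed by the snippet above
import sys
from taxonomy import Taxonomy   # assumed import; the real module may differ

desc   = "Fetch proteomes for the given taxa groups."
epilog = "Example: %(prog)s -t 4751 -v"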
Example #2
def main():
    import argparse
    parser = argparse.ArgumentParser(description=desc, epilog=epilog,
                                     formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--version', action='version', version='1.0b')
    parser.add_argument("-v", "--verbose", default=False, action="store_true",
                        help="verbose")
    parser.add_argument('-d', '--db', default="metaphors_201405",
                        help="database name     [%(default)s]")
    parser.add_argument('-t', '--taxids', nargs="+", type=int,
                        help="group taxid(s)    [%(default)s]")
    parser.add_argument("--taxadb", default="/users/tg/lpryszcz/cluster/rapsi/taxonomy.db3",
                        help="taxonomy path  [%(default)s]")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))
        
    # init taxonomy
    taxa = Taxonomy(o.taxadb)

    # init metaphors connection
    m = dbClient.metaphors(o.db)
    if o.verbose:
        sys.stderr.write("%s species in %s database\n" % (len(m.species), o.db))

    # process taxa groups
    for taxid in o.taxids:
        # fetch proteins from the given taxa
        taxid2proteomes(m, taxa, taxid, o.verbose)
Example #3

def read_taxonomy(tax_f):
    root = TNode('*', [])
    tax = Taxonomy(tax_f, root)

    with open(tax_f) as f:
        for line in f:
            node_name, ph_str = line.strip('\r\n').split('\t')
            node = TNode(node_name, ph_str.split(','))
            tax.add_node(node)

    return tax
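
A hedged usage sketch: each input line holds a node name, a tab, and a comma-separated phrase list (the file name and contents below are illustrative, not from the original):

# taxonomy.tsv (illustrative), one "name<TAB>phrase,phrase,..." record per line:
#   Mammalia	mammal,mammals,mammalian
tax = read_taxonomy("taxonomy.tsv")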
Example #4
    @classmethod
    def load_from_json(cls, name):
        """
        Loads a taxonomy object from the given json file and name.
        """

        name_with_extension = str(name) + ".json"
        file_name = "taxonomies/" + name_with_extension

        with open(file_name) as file_object:
            graph_json = json.load(file_object)

        graph = json_graph.node_link_graph(graph_json)
        return Taxonomy(graph)
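
load_from_json relies on networkx's node-link JSON format, so a save counterpart only needs the inverse call. A minimal sketch, assuming the Taxonomy object exposes its graph as taxonomy.graph (an assumption, not confirmed by the snippet):

import json
from networkx.readwrite import json_graph

def save_to_json(taxonomy, name):
    # hypothetical inverse of load_from_json; taxonomy.graph is assumed
    graph_json = json_graph.node_link_data(taxonomy.graph)
    with open("taxonomies/" + str(name) + ".json", "w") as file_object:
        json.dump(graph_json, file_object)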
Example #5
    def read(self, parsed_taxonomies):
        """
        Read in taxonomies for a given code table.

        params:
            parsed_taxonomies (dict{id: Taxonomy})
        """
        for key, taxonomy in parsed_taxonomies.items():
            synonym_phrases = [
                Phrase(synonym) for synonym in taxonomy.synonyms
            ]

            head_phrase = Phrase(taxonomy.head)
            self.taxonomies[key] = Taxonomy(key, head_phrase, synonym_phrases)
Example #6
    def taxonomies_to_str(self):
        """
        Convert taxonomies to strings, to be written to a file.

        returns:
            taxonomies_as_str (dict{id: Taxonomy}), with raw-string head and synonyms
        """
        taxonomies_as_str = {}
        for key, taxonomy in self.taxonomies.items():
            taxonomy_head = taxonomy.head.raw_form
            synonyms = [synonym.raw_form for synonym in taxonomy.synonyms]

            taxonomies_as_str[key] = Taxonomy(key, taxonomy_head, synonyms)

        return taxonomies_as_str
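
Examples #5 and #6 are inverses: read wraps the raw strings in Phrase objects, and taxonomies_to_str unwraps them again for writing. A hedged round trip, where the owning code-table object named table is hypothetical:

table.read(parsed_taxonomies)        # raw strings -> Phrase-backed taxonomies
as_str = table.taxonomies_to_str()   # Phrase-backed taxonomies -> raw strings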
Example #7
def find_cites(contribs):
    tax = Taxonomy()
    successes = 0
    failures = 0
    citations = []
    for contrib in contribs:
        row = None
        if "Rationale" in contrib.keys():
            row = contrib["Rationale"]
        if "Text" in contrib.keys():
            row = contrib["Text"]
        if row is not None:
            cites = search_for_citations(tax, row)
            entry = {"ID": contrib["ID"], "Citations": cites}
            citations.append(entry)
            successes += 1
        else:
            failures += 1
    return citations
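
A sketch of the expected input, inferred from the key checks above (IDs and texts are illustrative):

contribs = [
    {"ID": 1, "Rationale": "See Smith et al. (2019)."},
    {"ID": 2, "Text": "Based on Jones (2020)."},
    {"ID": 3},  # neither "Rationale" nor "Text": counted as a failure
]
citations = find_cites(contribs)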
Example #8
def hits2taxa(input, out, db, verbose, limit=0):
    """Process hits from the input file and assign reads to taxa.
    You may play with bufsize so the processes run without waiting.
    """
    # init taxonomy
    taxa = Taxonomy(db)

    # handle gzipped/bzip2 input
    if input.name.endswith('.gz'):
        input = gzip.open(input.name)
    elif input.name.endswith('.bz2'):
        import bz2
        input = bz2.BZ2File(input.name)
    # get match generator
    if input is sys.stdin:
        line0 = input.readline()
        if line0.startswith('@'):
            mGenerator = get_matches_sam(input, verbose)
        else:
            mGenerator = get_matches_blast8(input, verbose)
    # get sam stream
    elif input.name.endswith(('.sam', '.sam.gz')):
        mGenerator = get_matches_sam(input, verbose)
    else:
        mGenerator = get_matches_blast8(input, verbose)

    # process reads in 1K batches
    if verbose:
        sys.stderr.write("[%s] Processing reads from %s ...\n" %
                         (datetime.ctime(datetime.now()), input.name))
    # get taxa and genes
    taxid2reads = {}
    k = 0
    for i, (rname, hits) in enumerate(mGenerator, 1):
        if limit and i > limit:
            break
        if not rname:
            continue
        # print progress info
        if verbose and i % 1e4 == 1:
            sys.stderr.write(" %s parsed. %.2f%s with taxa  \r" %
                             (i, k * 100.0 / i, '%'))
        # get taxa
        taxid, matches = get_taxa(hits, taxa, verbose)
        if not taxid:
            continue
        k += 1
        if taxid not in taxid2reads:
            taxid2reads[taxid] = 0
        # count reads per taxid
        taxid2reads[taxid] += 1

    # report
    if not taxid2reads:
        sys.exit("No matches found!")
    # foreign reads
    freads = sum(reads for taxid, reads in taxid2reads.items())
    header = "#name\ttaxid\treads\t%\n"
    out.write(header)
    out.write("%s\t%s\t%s\t%.2f\n" % ("unknown", "-", i - freads, 100.0 *
                                      (i - freads) / i))
    for taxid, reads in sorted(taxid2reads.items(),
                               key=lambda x: x[1],
                               reverse=True)[:10]:
        out.write("%s\t%s\t%s\t%.2f\n" %
                  (taxa[taxid][1], taxid, reads, 100.0 * reads / i))
    # print summary
    sys.stderr.write("[hits2taxa] %s entries processed!\n" % (i, ))
Example #9
NOUNSET_BANK = NounSetBank(DATA_DIR + 'nounsets.yml')
NOUN_FORMS = {
    lang: NounFormBank(DATA_DIR + 'nouns_{}.yml'.format(lang))
    for lang in LANGUAGES
}

PREPSET_BANK = PrepositionSetBank(DATA_DIR + 'prepsets.yml')

PRONSET_BANK = PronounSetBank(DATA_DIR + 'pronsets.yml')
PRONOUN_FORMS = {
    lang: PronounFormBank(DATA_DIR + 'prons_{}.yml'.format(lang))
    for lang in LANGUAGES
}

TAXONOMY = Taxonomy(DATA_DIR + 'taxonomy.yml')

#VERBSET_BANK = VerbSetBank(DATA_DIR + 'verbsets/')
VERBSET_BANK = VerbSetBank(DATA_DIR + 'verbsets.yml')
VERB_FORMS = {
    lang: VerbFormBank(DATA_DIR + 'verbs_{}.yml'.format(lang))
    for lang in LANGUAGES
}

TEMPLATE_DIR = DATA_DIR + 'templates/'
ADJP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'adjp_templates.yml')
ADVP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'advp_templates.yml')
CLAUSE_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'clause_templates.yml')
#CUSTOM_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'custom_templates.yml')
CUSTOM_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'custom_postedited.yml')
NP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'np_templates.yml')
Example #10
#
# TODO: list-types is missing
#

INFO = """
This the CLI for the Orange Button Core library.  Information is available at the following URL's:

Orange Button Overview: https://sunspec.org/orange-button-initiative/
Orange Button GitHUb: https://github.com/SunSpecOrangeButton
Orange Button CLI GitHub: https://github.com/SunSpecOrangeButton/core
"""

DASHES = "---------------------------------------------------------------------------------------"

taxonomy = Taxonomy()
csv = False
json = False
xml = False


def info(args):
    print(INFO)


def convert(args):

    p = Parser(taxonomy)

    ff = None
    if json:
Example #11

# Instantiate Page and Taxonomy objects from the filtered information
page = tk.result_list_page
tax = tk.result_list_tax



# target object lists
aim_page = []
aim_tax = []

for item in page:  # instantiate pages
    aim_page.append(Page(item=item))

for item in tax:  # instantiate taxonomies
    aim_tax.append(Taxonomy(tax_name=item['type'], item=item))
    



# -----------------------------------      XiaoAi       ------------------------------------------
xiaoai = XiaoAi()

# add the tasks to XiaoAi's queue
for item in aim_page:
    xiaoai.add_task(item)

for item in aim_tax:
    xiaoai.add_task(item)

# start the automated generation
Example #12
    def assign_taxonomy(self, key, output_dir, dna_region, names_file,
                        ref_taxa):

        from taxonomy import Taxonomy, consensus
        #results = uc_results
        results = {}

        try:
            self.runobj.run_status_file_h.write(
                json.dumps({'status': "STARTING_ASSIGN_TAXONOMY: " + key}) +
                "\n")
        except Exception:
            pass
        # test_read='FI1U8LC02GEF7N'
        # open gast_file to get results; write tagtax output files under output_dir
        tagtax_terse_filename = os.path.join(output_dir, "tagtax_terse")
        tagtax_long_filename = os.path.join(output_dir, "tagtax_long")
        tagtax_terse_fh = open(tagtax_terse_filename, 'w')
        tagtax_long_fh = open(tagtax_long_filename, 'w')
        tagtax_long_fh.write("\t".join([
            "read_id", "taxonomy", "distance", "rank", "refssu_count", "vote",
            "minrank", "taxa_counts", "max_pcts", "na_pcts", "refhvr_ids"
        ]) + "\n")
        gast_file = os.path.join(output_dir, "gast" + dna_region)
        if not os.path.exists(gast_file):
            logging.info("gast:assign_taxonomy: Could not find gast file: " +
                         gast_file + ". Returning")
            return results

        for line in open(gast_file, 'r'):
            # must split on tab because last field may be empty and must be maintained as blank
            data = line.strip().split("\t")
            if len(data) == 3:
                data.append("")
            # 0=id, 1=ref, 2=dist, 3=align 4=frequency
            #if data[0]==test_read:
            #    print 'found test in gastv6 ', data[1].split('|')[0], data[2], data[3]

            read_id = data[0]
            if read_id in results:
                results[read_id].append(
                    [data[1].split('|')[0], data[2], data[3], data[4]])
            else:
                results[read_id] = [[
                    data[1].split('|')[0], data[2], data[3], data[4]
                ]]

        for line in open(names_file, 'r'):
            data = line.strip().split("\t")
            dupes = data[1].split(",")
            read_id = data[0]
            taxObjects = []
            distance = 0
            frequency = 0
            refs_for = {}

            # print 'read_id', read_id
            # assign taxonomy method, either fake or real
            if read_id not in results:
                results[read_id] = [
                    "Unknown", '1', "NA", '0', '0', "NA", "0;0;0;0;0;0;0;0",
                    "0;0;0;0;0;0;0;0", "100;100;100;100;100;100;100;100"
                ]
                refs_for[read_id] = ["NA"]
            else:
                # it is in results[]
                #print 'read_id in res', read_id, results[read_id]
                #if read_id == test_read_id:
                #    print 'found ', test_read_id, results[test_read_id]
                for i in range(0, len(results[read_id])):
                    #for resultread_id in results[read_id]:
                    #print 'resread_id', results[read_id]
                    ref = results[read_id][i][0]
                    if ref in ref_taxa:
                        for tax in ref_taxa[ref]:
                            for t in tax:
                                taxObjects.append(Taxonomy(t))
                    else:
                        pass

                    if read_id in refs_for:
                        #if read_id ==test_read_id:
                        #    print '2', read_id, refs_for[test_read_id]
                        if results[read_id][i][0] not in refs_for[read_id]:
                            refs_for[read_id].append(results[read_id][i][0])
                    else:
                        #if read_id == test_read_id:
                        #    print '1', read_id, results[read_id][i][0]
                        refs_for[read_id] = [results[read_id][i][0]]

                    # should all be the same distance for the duplicates
                    distance = results[read_id][i][1]
                    frequency = results[read_id][i][3]
                #Lookup the consensus taxonomy for the array
                taxReturn = consensus(taxObjects, C.majority)

                # 0=taxObj, 1=winning vote, 2=minrank, 3=rankCounts, 4=maxPcts, 5=naPcts;
                taxon = taxReturn[0].taxstring()
                #if taxon[-3:] = ';NA':
                #    taxon = taxon[:-3]
                #tax_counter[taxon]
                rank = taxReturn[0].depth()
                #print read_id, taxon, rank, taxReturn[0], taxReturn[1]
                if not taxon: taxon = "Unknown"

                # (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts)
                results[read_id] = [
                    taxon,
                    str(distance), rank,
                    str(len(taxObjects)),
                    str(taxReturn[1]), taxReturn[2], taxReturn[3],
                    taxReturn[4], taxReturn[5]
                ]
                #print "\t".join([read_id, taxon, str(distance), rank, str(len(taxObjects)), str(taxReturn[1]), taxReturn[2], taxReturn[3], taxReturn[4], taxReturn[5]]) + "\n"
            # sample tagtax_long output:
            # read_id  taxonomy        distance        rank    refssu_count    vote    minrank taxa_counts     max_pcts        na_pcts refhvr_ids
            # D4ZHLFP1:25:B022DACXX:3:1101:12919:40734 1:N:0:TGACCA|frequency:162     Bacteria;Proteobacteria;Gammaproteobacteria     0.117   class   2       100     genus   1;1;1;2;2;2;0;0 100;100;100;50;50;50;0;0        0;0;0;0;0;0;100;100     v6_CI671
            # D4ZHLFP1:25:B022DACXX:3:1101:10432:76870 1:N:0:TGACCA|frequency:105     Bacteria;Proteobacteria;Gammaproteobacteria     0.017   class   1       100     class   1;1;1;0;0;0;0;0 100;100;100;0;0;0;0;0   0;0;0;100;100;100;100;100       v6_BW306

            # replace the hash with final taxonomy results, for each copy of the sequence
            for d in dupes:
                # print OUT join("\t", $d, @{$results{$read_id}}, join(", ", sort @{$refs_for{$read_id}})) . "\n";
                d = d.strip()
                tagtax_long_fh.write(d + "\t" + "\t".join(results[read_id]) +
                                     "\t" +
                                     ', '.join(sorted(refs_for[read_id])) +
                                     "\n")
                tagtax_terse_fh.write(d + "\t" + results[read_id][0] + "\t" +
                                      results[read_id][2] + "\t" +
                                      results[read_id][3] + "\t" +
                                      ', '.join(sorted(refs_for[read_id])) +
                                      "\t" + results[read_id][1] + "\t" +
                                      str(frequency) + "\n")

        tagtax_terse_fh.close()
        tagtax_long_fh.close()
        return results
Example #13
def main():
    logger.warning("Start building taxonomy")
    # Load input: this includes reading network, text, and
    # a background corpus for contrastive analysis
    logger.info("Loading graph from file")
    A, node_info = utils.load_graph(args.data_dir,
                                    remove_citation=True,
                                    force_undirected=True)
    logger.info("Create HIN")
    G = HIN(A, node_info)

    logger.info("Load text")
    corpus = utils.load_documents(args.data_dir)

    motif_matchers = [
        Motif_KPV(),
        Motif_KPA(),
        Motif_KP(),
        Motif_KPVY(),
        Motif_KPAA()
    ]

    intermediate_dir = plib.Path(args.data_dir, "intermediate")
    if not intermediate_dir.is_dir():
        logger.warning(f"Creating intermediate dir {intermediate_dir}")
        intermediate_dir.mkdir(parents=False)

    # we collect all phrases
    T = []  # terms / phrases
    for info in node_info.values():
        if info.node_type == "K":
            T.append(info.entity_id)

    D = corpus
    tf_bg, idf_bg = utils.get_tf_idf_from_file(
        plib.Path(args.data_dir, "background_documents.txt"), T)

    taxo = Taxonomy(D, T, G)

    builder = NetTaxo(motif_matchers,
                      tf_lift=args.tf_lift,
                      idf_lift=args.idf_lift,
                      damping=args.damping,
                      conf_motif=Motif_KPA().motif_name)

    # set background corpus for contrastive analysis
    builder.set_background(tf_bg, idf_bg)
    builder.build(taxo, args.levels)

    # save
    output_dir = plib.Path(args.output_dir, config.unique_id)
    if not output_dir.is_dir():
        output_dir.mkdir(parents=True)
    logger.info(f"Saving to {output_dir}")
    taxo.save(output_dir)

    logger.info("Saving complete")

    # generate output
    taxo.visualize(plib.Path(output_dir, "vis.pdf"))
    taxo.save_readable(output_dir)