Example 1
def main(argv):
    args = vars(parser.parse_args())
    
    # read the input mapping file to create the read2origin map
    fh = open(args["input_mapping"], "r")
    lines = fh.readlines()
    fh.close()
    
    # create preferred megan mapping
    ncbi_megan_map = {}
    with open(args["ncbi_megan_map"], 'r') as meganfile:
        for line in meganfile:
            fields = line.split("\t")
            fields = map(str.strip, fields)
            ncbi_megan_map[fields[0]] = fields[1]

    read2origin = {} # hash mapping input sequences to their original header
    
    # fill read2origin
    for l in lines:
        fields = l.split("\t")
        fields = map(str.strip, fields)
        read_name = fields[0]
        gi_line = fields[1].split("|")
        ncbi_id = re.sub(r"\.[0-9]+$", "", gi_line[3])  # drop the accession version suffix
        read2origin[read_name] = ncbi_id
    
    # Parse NCBI summary table
    fh = open(args["sum_table"], "r")
    lines = fh.readlines()
    fh.close()
    
    ncbiID_to_taxaID = {} # ncbiID to NCBI Tree taxaID
    taxaID_to_taxa = {} # taxaID to full taxa name
    for l in lines:
        fields = l.split("\t")
        fields = map(str.strip, fields)
        ncbi_id = re.sub(r"\.[0-9]+$", "", fields[0])
        tax_id = fields[3]
        tax = fields[5]
        if ncbi_id not in ncbiID_to_taxaID:
            ncbiID_to_taxaID[ncbi_id] = tax_id
        if tax_id not in taxaID_to_taxa:
            taxaID_to_taxa[tax_id] = tax
    
    # read functional and taxonomic table
    fh = open(args["ft_table"], "r")
    header = fh.readline()
    header = header.split("\t") # list of headers
    lines = fh.readlines()
    fh.close()

    contig_to_orfs = {} # get a list of ORFs for a specific contig
    orfs_to_lca = {} # get the lca taxonomy

    for l in lines:
        fields = l.split("\t")
        # ORF_ID   ORF_length  start   end Contig_Name Contig_length   strand  ec  taxonomy    product
        orf_id = fields[0]
        contig = fields[4]
        lca = fields[8]
        if contig not in contig_to_orfs:
            contig_to_orfs[contig] = []
        contig_to_orfs[contig].append(orf_id)
        if orf_id not in orfs_to_lca:
            orfs_to_lca[orf_id] = lca
        # get original taxonomy
        # print taxaID_to_taxa[ncbiID_to_taxaID[read2origin[contig]]]
    
    # Build the LCA Star NCBI Tree
    print "Loading LCAStar:"
    lcastar = LCAStar(args["ncbi_tree"])
    print "Done."
    
    # set LCAStar parameters
    lcastar.setLCAStarParameters(min_depth = 1, alpha = 0.5, min_reads = 1 )
    
    # small helper that maps a list of ORF IDs to their LCA taxa
    def get_orfs_taxa(orfs):
        return [orfs_to_lca[o] for o in orfs]

    output = open(args["output"], "w")
    header = "\t".join(["contig","real","taxa","method","dist", "wtd", "real_linage"])
    output.write(header + "\n")
    for contig in contig_to_orfs:
        # the reference ("real") taxon the contig originated from
        real = taxaID_to_taxa[ncbiID_to_taxaID[read2origin[contig]]]
        taxa_list = get_orfs_taxa(contig_to_orfs[contig])
        lca_list = [[t] for t in taxa_list]

        # translate the reference lineage into preferred (MEGAN) names
        real_lineage = lcastar.get_lineage(lcastar.get_a_Valid_ID([real]))
        real_lineage = [translate_to_prefered_name(tid, ncbi_megan_map, lcastar) for tid in real_lineage]

        # write one output line per estimate: LCA*, Majority, and LCA^2
        estimates = [("LCA_Star", lcastar.lca_star(taxa_list)),
                     ("Majority", lcastar.lca_majority(taxa_list)),
                     ("LCA_Squared", lcastar.getTaxonomy(lca_list))]
        for method, taxon in estimates:
            line = "\t".join([contig, real,
                              translate_to_prefered_name(lcastar.get_a_Valid_ID([taxon]), ncbi_megan_map, lcastar),
                              method,
                              str(lcastar.get_distance(taxon, real)),
                              str(lcastar.wtd_distance(real, taxon)),
                              ";".join(real_lineage[::-1])])
            output.write(line + "\n")

    output.close()
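
Example 1 relies on module-level names that are not shown in the snippet: an argparse parser, the re module, the LCAStar class, and the translate_to_prefered_name helper. Below is a minimal sketch of a parser that would satisfy the args[...] keys used above; only the destination names come from the snippet, while the option strings and help text are illustrative assumptions.

import argparse

# Hypothetical parser: the dest names match the args[...] keys used in Example 1;
# the option strings and help text are guesses, not taken from the original script.
parser = argparse.ArgumentParser(description="Compare LCA*, Majority, and LCA^2 contig taxonomy estimates")
parser.add_argument("--input_mapping", dest="input_mapping", required=True,
                    help="tab-separated read-to-origin mapping file")
parser.add_argument("--ncbi_megan_map", dest="ncbi_megan_map", required=True,
                    help="tab-separated NCBI-to-MEGAN preferred-name map")
parser.add_argument("--sum_table", dest="sum_table", required=True,
                    help="NCBI summary table (accession, taxa ID, taxa name)")
parser.add_argument("--ft_table", dest="ft_table", required=True,
                    help="functional and taxonomic table, one row per ORF")
parser.add_argument("--ncbi_tree", dest="ncbi_tree", required=True,
                    help="NCBI tree file used to build the LCAStar object")
parser.add_argument("--output", dest="output", required=True,
                    help="output TSV of per-contig taxonomy estimates")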
Example 2
def main(argv):
    # parse arguments
    args = vars(parser.parse_args())

    ## read input mapping file (.mapping.txt) to create the contig2origin map
    # dictionary/hash table mapping input sequences to their original header, e.g. >[Header]
    contig2origin = create_contig2origin(args["mapping_file"])

    ## create preferred mapping
    ncbi_megan_map = {}  # hash map from given taxonomy to preferred one used by megan
    with open(args["ncbi_megan_map"], 'r') as meganfile:
        for line in meganfile:
            fields = line.split("\t")
            fields = map(str.strip, fields)
            ncbi_megan_map[fields[0]] = fields[1]

    # contig_to_taxa data structure
    contig_to_taxa = {}

    # Read blast table
    with open(args["parsed_blast"], "r") as fh:
        for l in fh:
            if re.match(r"^#", l):
                continue  # skip comment lines
            else:
                fields = clean_tab_lines(l)
                contig_hits = contig_pattern.search(fields[0])
                if contig_hits:
                    contig = contig_hits.group(1)
                    orf = contig_hits.group(2)
                    # add to data structure if it doesn't exist
                    if contig not in contig_to_taxa:
                        contig_to_taxa[contig] = {}
                    if orf not in contig_to_taxa[contig]:
                        contig_to_taxa[contig][orf] = []
                    # pull taxonomy out of annotation
                    taxa_hits = taxonomy_pattern.search(fields[9])
                    if taxa_hits:
                        taxa = taxa_hits.group(1)
                        bitscore = fields[3]
                        contig_to_taxa[contig][orf].append(
                            (taxa, float(bitscore)))
                    else:
                        continue
                else:
                    continue

    ## Load contig references (if available or applicable)

    # read contig taxa reference if available
    contig_to_taxa_ref = None
    if args["contig_taxa_ref"]:
        contig_to_taxa_ref = {}
        with open(args["contig_taxa_ref"], "r") as fh:
            for l in fh:
                fields = clean_tab_lines(l)
                contig_id = fields[0]
                contig_origin = fields[1]
                contig_to_taxa_ref[contig_id] = contig_origin

    # all contigs hypothetically have the same reference origin (i.e., single cells)
    sample_ref = None
    if args["sample_taxa_ref"]:
        sample_ref = args["sample_taxa_ref"]

    ## Build the LCA Star NCBI Tree
    lcastar = LCAStar(args["ncbi_tree"])
    lcastar.setLCAStarParameters(min_depth=1, alpha=args["alpha"], min_reads=1)

    ## Calculate LCA for each ORF
    contig_to_lca = {}
    for contig in contig_to_taxa:
        for orf in contig_to_taxa[contig]:
            if contig not in contig_to_lca:
                contig_to_lca[contig] = {}
            if orf not in contig_to_lca[contig]:
                contig_to_lca[contig][orf] = None
            contig_taxas = contig_to_taxa[contig][orf]
            if len(contig_taxas) == 0:
                contig_to_lca[contig][orf] = "root"
            else:
                if args['orf_summary'] == 'besthit':
                    contig_taxas.sort(key=operator.itemgetter(1), reverse=True)
                    best_blast_taxa = contig_taxas[0][0]
                    contig_to_lca[contig][orf] = best_blast_taxa
                elif args['orf_summary'] == 'orf_majority':
                    majority_list = []
                    for t in contig_taxas:
                        majority_list.append(t[0])
                    #TODO Update to check for alternative taxonomy names
                    contig_to_lca[contig][orf] = lcastar.simple_majority(
                        majority_list)
                else:
                    lca_list = []  # create a list of lists for LCA calculation
                    for t in contig_taxas:
                        lca_list.append([t[0]])
                    contig_to_lca[contig][orf] = lcastar.getTaxonomy(lca_list)

    ## calculate taxonomy statistics (LCA) for each ORF

    ## LCA^2, Majority, and LCA* for each ORF
    writeout(args, contig_to_lca, contig_to_taxa_ref, sample_ref, lcastar,
             ncbi_megan_map)
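
Both Example 2 and Example 3 call two small helpers, create_contig2origin and clean_tab_lines, whose definitions are not included in the snippets. The sketches below are consistent with how they are used (tab-separated input, whitespace-stripped fields) but are assumptions, not the original implementations.

def clean_tab_lines(line):
    # Assumed behaviour: split a tab-separated line and strip whitespace/newlines
    # from each field, returning the list of cleaned fields.
    return [field.strip() for field in line.split("\t")]


def create_contig2origin(mapping_file):
    # Assumed behaviour: map each contig/read ID (column 1) to its original
    # FASTA header / origin (column 2) from the .mapping.txt file.
    contig2origin = {}
    with open(mapping_file, "r") as fh:
        for line in fh:
            fields = clean_tab_lines(line)
            if len(fields) >= 2:
                contig2origin[fields[0]] = fields[1]
    return contig2origin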
Example 3
def main(argv):
    # parse arguments
    args = vars(parser.parse_args())

    ## read input mapping file (.mapping.txt) to create the contig2origin map
    # dictionary/hash table mapping input sequences to their original header, e.g. >[Header]
    contig2origin = create_contig2origin(args["mapping_file"])

    ## create preferred mapping
    ncbi_megan_map = {} # hash map from given taxonomy to preferred one used by megan
    with open(args["ncbi_megan_map"], 'r') as meganfile:
        for line in meganfile:
            fields = line.split("\t")
            fields = map(str.strip, fields)
            ncbi_megan_map[fields[0]] = fields[1]

    # contig_to_taxa data structure
    contig_to_taxa = {}

    # Read blast table
    with open(args["parsed_blast"], "r") as fh:
        for l in fh:
            if re.match(r"^#", l):
                continue  # skip comment lines
            else:
                fields = clean_tab_lines(l)
                contig_hits = contig_pattern.search(fields[0])
                if contig_hits:
                    contig = contig_hits.group(1)
                    orf = contig_hits.group(2)
                    # add to data structure if it doesn't exist
                    if contig not in contig_to_taxa:
                        contig_to_taxa[contig] = {}
                    if orf not in contig_to_taxa[contig]:
                        contig_to_taxa[contig][orf] = []
                    # pull taxonomy out of annotation
                    taxa_hits = taxonomy_pattern.search(fields[9])
                    if taxa_hits:
                        taxa = taxa_hits.group(1)
                        bitscore = fields[3]
                        contig_to_taxa[contig][orf].append((taxa, float(bitscore)))
                    else:
                        continue
                else:
                    continue

    ## Load contig references (if available or applicable)

    # read contig taxa reference if available
    contig_to_taxa_ref = None
    if args["contig_taxa_ref"]:
        contig_to_taxa_ref = {}
        with open(args["contig_taxa_ref"], "r") as fh:
            for l in fh:
                fields = clean_tab_lines(l)
                contig_id = fields[0]
                contig_origin = fields[1]
                contig_to_taxa_ref[contig_id] = contig_origin

    # all contigs hypothetically have the same reference origin (i.e., single cells)
    sample_ref = None
    if args["sample_taxa_ref"]:
        sample_ref = args["sample_taxa_ref"]

    ## Build the LCA Star NCBI Tree
    lcastar = LCAStar(args["ncbi_tree"])
    lcastar.setLCAStarParameters(min_depth = 1, alpha = args["alpha"], min_reads = 1)

    ## Calculate LCA for each ORF
    contig_to_lca = {}
    for contig in contig_to_taxa:
        for orf in contig_to_taxa[contig]:
            if contig not in contig_to_lca:
                contig_to_lca[contig] = {}
            if orf not in contig_to_lca[contig]:
                contig_to_lca[contig][orf] = None
            contig_taxas = contig_to_taxa[contig][orf]
            if len(contig_taxas) == 0:
                contig_to_lca[contig][orf] = "root"
            else:
                if args['orf_summary'] == 'besthit':
                    contig_taxas.sort(key=operator.itemgetter(1), reverse=True)
                    best_blast_taxa = contig_taxas[0][0]
                    contig_to_lca[contig][orf] = best_blast_taxa
                elif args['orf_summary'] == 'orf_majority':
                    majority_list = []
                    for t in contig_taxas:
                        majority_list.append(t[0])
                    #TODO Update to check for alternative taxonomy names
                    contig_to_lca[contig][orf] = lcastar.simple_majority(majority_list)
                else:
                    lca_list = [] # create a list of lists for LCA calculation
                    for t in contig_taxas:
                        lca_list.append([t[0]])
                    contig_to_lca[contig][orf] = lcastar.getTaxonomy(lca_list)

    ## calculate taxonomy statistics (LCA) for each ORF

    ## LCA^2, Majority, and LCA* for each ORF    
    writeout(args, contig_to_lca, contig_to_taxa_ref, sample_ref, lcastar, ncbi_megan_map)
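
The BLAST-parsing loop in Examples 2 and 3 also assumes two module-level compiled regexes, contig_pattern and taxonomy_pattern, which are not shown. A plausible pair is sketched below, assuming ORF IDs of the form <contig>_<orf_number> and annotations that carry the organism name in square brackets; both formats are assumptions rather than facts from the snippets.

import re

# Assumption: ORF IDs look like "<contig_name>_<orf_number>", e.g. "contig_42_3";
# group(1) captures the contig name and group(2) the ORF number.
contig_pattern = re.compile(r"^(.+)_(\d+)$")

# Assumption: the annotation field (fields[9]) contains an NCBI-style organism
# tag in square brackets, e.g. "... [Escherichia coli]"; group(1) is the taxon name.
taxonomy_pattern = re.compile(r"\[([^\[\]]+)\]")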
def main():
    args = vars(parser.parse_args())

    # parse options
    parsed_blast = args["parsed_blast"][0]
    pwy_long = args["pwy_long"][0]
    ncbi_tree = args["ncbi_tree"][0]
    megan_names = args["megan_names"][0]
    output_file = args["output_file"][0]
    lca = args["lca"]
    rpkm_file = args["rpkm"][0]

    # parse RPKM value for every ORF
    orf_id_to_rpkm = {}
    if rpkm_file:
        orf_id_to_rpkm = process_rpkm_file(rpkm_file, orf_id_to_rpkm)

    # Load NCBI Tree and LCA Star object
    global lcastar
    lcastar = LCAStar(ncbi_tree, megan_names)

    # LCA star parameters
    alpha = 0.51
    min_reads = 0
    lcastar.setLCAStarParameters(0, alpha, min_reads)
    print 'Done initializing NCBI Tree'

    # capture taxonomic annotation from parsed BLAST file
    blast_to_taxonomy = {}  # map from read to taxonomy
    blast_to_taxonomy = read_and_clean_parsed_blast(parsed_blast,
                                                    blast_to_taxonomy, lca)

    # read through pathway file and append taxonomy field if hit occurs:
    header = [
        "SAMPLE", "PWY_NAME", "PWY_COMMON_NAME", "RXN_NAME", "RXN_COMMON_NAME",
        "NUM_REACTIONS", "NUM_COVERED_REACTIONS", "ORF_COUNT", "ORF",
        "TAXONOMY"
    ]
    if orf_id_to_rpkm:
        header.append("RPKM")
    if ".gz" in pwy_long:
        with gzip.open(output_file + ".gz", 'w') as fh_out:
            fh_out.write("\t".join(header) + "\n")
            with gzip.open(pwy_long, 'rb') as fh:
                fh.readline()  # read out old header
                for line in fh:
                    fields = line.split("\t")
                    orf_id = fields[-1].strip("\n")
                    fields[-1] = fields[-1].strip("\n")
                    if orf_id in blast_to_taxonomy:
                        fields.append(blast_to_taxonomy[orf_id])
                    else:
                        fields.append("NA")
                    if orf_id_to_rpkm:
                        if orf_id in orf_id_to_rpkm:
                            fields.append(orf_id_to_rpkm[orf_id])
                        else:
                            fields.append("NA")
                    fh_out.write("\t".join(fields) + "\n")
    else:
        with open(output_file, 'w') as fh_out:
            fh_out.write("\t".join(header) + "\n")
            with open(pwy_long, 'r') as fh:
                fh.readline()  # read out old header
                for line in fh:
                    fields = line.split("\t")
                    orf_id = fields[-1].strip("\n")
                    fields[-1] = fields[-1].strip("\n")
                    if orf_id in blast_to_taxonomy:
                        fields.append(blast_to_taxonomy[orf_id])
                    else:
                        fields.append("NA")
                    if orf_id_to_rpkm:
                        if orf_id in orf_id_to_rpkm:
                            fields.append(orf_id_to_rpkm[orf_id])
                        else:
                            fields.append("NA")
                    fh_out.write("\t".join(fields) + "\n")

    exit()
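
The second main() above depends on process_rpkm_file (and read_and_clean_parsed_blast), which are defined elsewhere in the script. A minimal sketch of the RPKM reader is given below, under the assumption that the RPKM file is tab-separated with the ORF ID in the first column and its RPKM value in the second.

def process_rpkm_file(rpkm_file, orf_id_to_rpkm):
    # Assumed format: one ORF per line, "<orf_id>\t<rpkm_value>".
    # Values are kept as strings because they are only written back out to a TSV.
    with open(rpkm_file, "r") as fh:
        for line in fh:
            fields = line.strip().split("\t")
            if len(fields) >= 2:
                orf_id_to_rpkm[fields[0]] = fields[1]
    return orf_id_to_rpkm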