def fasta_stats(fastafile, display=False):
    """Print basic length statistics for the sequences in a fasta file.

    Writes to stdout the number of sequences, their combined length, the
    longest sequence and the N50: the length of the sequence at which the
    running total, taken longest-first, reaches half of the combined length,
    together with how many sequences were needed to get there.

    fastafile -- handed unchanged to internal.parsefasta(); element [1] of
                 every record it yields is measured with len().
    display   -- when true, a histogram of the lengths is shown via plt.

    Returns None. An input with no sequences prints the two totals and stops.
    """
    # Longest first, so the N50 walk below can stop at the first sequence
    # that tips the running total over the halfway mark.
    genelens = sorted((len(record[1])
                       for record in internal.parsefasta(fastafile)),
                      reverse=True)
    totallen = sum(genelens)

    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("Number of transcripts: %d" % len(genelens))
    print("Total length of transcripts: %d" % totallen)
    if not genelens:
        # Nothing to index, plot or accumulate.
        return
    print("longest transcript: %d" % genelens[0])

    if display:
        plt.hist(genelens)
        plt.title("Histogram of gene lengths (bp)")
        plt.show()

    lensum = 0
    for trancount, length in enumerate(genelens, 1):
        lensum += length
        # Doubling the running sum avoids the floor of integer `totallen / 2`
        # and reports the sequence that actually crosses the threshold.
        if 2 * lensum >= totallen:
            print("N50: %d (%d transcripts)" % (length, trancount))
            break
Example #2
0
def fasta_stats(fastafile, display=False):
    """Report count, total length, longest length and N50 of a fasta file.

    All output is printed to stdout. The N50 is the length of the sequence
    at which the cumulative length, accumulated from the longest sequence
    downwards, first reaches at least half of the total length; the number
    of sequences accumulated up to and including that one is printed too.

    fastafile -- passed as-is to internal.parsefasta(); item [1] of each
                 yielded record is measured with len().
    display   -- if true, plot a histogram of the lengths with plt.

    Returns None. If the file yields no records, only the count and total
    (both zero) are printed.
    """
    # Sorted in descending order: required by the N50 accumulation below.
    genelens = sorted([len(seq[1]) for seq in internal.parsefasta(fastafile)],
                      reverse=True)
    totallen = sum(genelens)

    # print(...) with one argument behaves identically on Python 2 and 3.
    print("Number of transcripts: %d" % len(genelens))
    print("Total length of transcripts: %d" % totallen)
    if len(genelens) == 0:
        # genelens[0] below would raise IndexError on an empty file.
        return
    print("longest transcript: %d" % genelens[0])

    if display:
        plt.hist(genelens)
        plt.title("Histogram of gene lengths (bp)")
        plt.show()

    lensum = 0
    trancount = 0
    for seqlen in genelens:
        lensum += seqlen
        trancount += 1
        # Compare doubled sum with the total so odd totals are not floored,
        # and report the current sequence rather than the one before it.
        if lensum * 2 >= totallen:
            print("N50: %d (%d transcripts)" % (seqlen, trancount))
            break
    if args.stats and args.fasta:
        fasta_stats(args.fasta, args.display_on)

    #############################################################################
    ### extract sequence information from each gene in supplied transdecoder file
    transcript_dic = {}
    gene_families = {}
    seq_families = {}
    geneid_idx = {}
    gf_idx = {}


    full_blast = get_full_blast_idx(args.blast)
    verbalise("Y", "Created full blast index with %d entries" % (len(full_blast)))

    for defline, seq in internal.parsefasta(args.transdecoder):
        # get trinity and transdecoder gene ids:
        tdid, trinityid = parse_defline(defline)

        # get any blast results:
        if tdid in full_blast:
            blastline = full_blast[tdid]
        else:
            blastline = None

        # create new transcript instance
        newtranscript = Transcript(trinity_id=trinityid,
                                    transdecoder_id=tdid,
                                    blastline=blastline,
                                    seq=seq)
Example #4
0
def get_similar_sequences(temp_dir,
                          buildhmmer=False,
                          fastafile=None,
                          specieslist={},
                          species=None,
                          genes=[],
                          dbpaths={},
                          mincollect=2,
                          globalthresh=0.2,
                          localthresh=0.8,
                          verbalise=lambda *a: None):
    """Search for sequences similar to the supplied genes / fasta records.

    Two routes are taken depending on how many inputs there are:

    * more than one input (or buildhmmer=True): the input sequences are
      written to temp_dir, aligned with mafft_align(), turned into a model
      with the external `hmmbuild` command, and hmmer_search() is run with
      that model (hmmfile=<model path>);
    * otherwise: the last record yielded by internal.get_gene_fastas() is
      passed to hmmer_search() directly (hmmfile=None).

    temp_dir     -- directory in which query_fasta, hmminput.fa,
                    mafft_align_input.fa and hmmmodel.fa are created.
    buildhmmer   -- force the model-building route.
    fastafile    -- if truthy, its records are copied to temp_dir/query_fasta
                    and that path is forwarded as extra_file_search.
    specieslist, dbpaths -- forwarded to internal.get_gene_fastas() and
                    hmmer_search().
    species      -- forwarded as query_species (see NOTE in the body).
    genes        -- a list of gene names, or a single value that is wrapped
                    in a list; empty strings are dropped.
    mincollect, globalthresh -- forwarded to hmmer_search().
    localthresh  -- forwarded to hmmer_search() as minthresh.
    verbalise    -- callable invoked as verbalise(code, message), where code
                    is one of the strings "Y", "B", "R", "C" in this function.

    Returns whatever hmmer_search() returns, or {} when no input sequence
    could be extracted.
    """
    # clean gene list type and content:
    if not isinstance(genes, list):
        genes = [genes]
    genes = [g for g in genes if g != '']

    # count genes provided:
    genelist_num, fasta_num = internal.count_genes(genes, fastafile)
    verbalise("Y",
              "Genelist size:%d\nFasta size:%d" % (genelist_num, fasta_num))

    # if fasta files are provided, create a temp fastafile to search against
    # with hmmer:
    if fastafile:
        extra_file_search = os.path.join(temp_dir, "query_fasta")
        # `with` guarantees the handle is closed even if parsing fails midway.
        with open(extra_file_search, 'w') as handle:
            for defline, seq in internal.parsefasta(fastafile):
                handle.write(">%s\n%s\n" % (defline, seq))
    else:
        extra_file_search = None

    if genelist_num + fasta_num > 1:
        buildhmmer = True

    if buildhmmer:
        hmminput = os.path.join(temp_dir, "hmminput.fa")
        seqcount = 0
        with open(hmminput, 'w') as handle:
            verbalise("B",
                      "Extracting sequence data from %d peptides" % len(genes))

            # NOTE(review): the loop variable rebinds the `species` parameter,
            # so query_species below is the species of the last record
            # yielded, not the caller's argument -- confirm this is intended.
            for defline, seq, species in internal.get_gene_fastas(
                    genes=genes,
                    species=None,
                    fastafile=fastafile,
                    specieslist=specieslist,
                    dbpaths=dbpaths):
                if seq:
                    seqcount += 1
                    handle.write("%s\n%s\n" % (defline, seq))

        if seqcount == 0:
            verbalise("R", "No gene sequences were found.")
            return {}

        # create alignment of input sequences:
        mafft_align1 = os.path.join(temp_dir, "mafft_align_input.fa")
        mafft_align(hmminput, mafft_align1)

        try:
            verbalise(
                "B",
                "Creating hidden markov model from %d sequences" % seqcount)
            # create hmmbuild model of alignment:
            hmmmodel = os.path.join(temp_dir, "hmmmodel.fa")
            # make sure the model file exists before hmmbuild is launched
            open(hmmmodel, 'a').close()
            handle = os.popen(" ".join(
                ['hmmbuild --informat afa', hmmmodel, mafft_align1]))
            handle.close()

            homologlist = hmmer_search(None,
                                       specieslist,
                                       query_species=species,
                                       minthresh=localthresh,
                                       temp_dir=temp_dir,
                                       dbpaths=dbpaths,
                                       mincollect=mincollect,
                                       globalthresh=globalthresh,
                                       hmmfile=hmmmodel,
                                       verbalise=verbalise,
                                       extra_file_search=extra_file_search)
        finally:
            # intermediate files are removed even when the search fails
            os.remove(mafft_align1)
            os.remove(hmminput)

    else:
        # `genes` is already a list here (normalised at the top).
        verbalise("B", "Extracting sequence from %s" % genes)
        # run phmmer on a single input gene/sequence:
        seq = ""
        for defline, seq, species in internal.get_gene_fastas(
                genes=genes,
                species=species,
                fastafile=fastafile,
                specieslist=specieslist,
                dbpaths=dbpaths):
            fasta_seq = "%s\n%s\n" % (defline, seq)
            verbalise("C", fasta_seq)

        # `seq` is still "" when nothing was yielded (fasta_seq is then
        # unbound), or empty when the last record had no sequence.
        if not seq:
            verbalise("R", "No genes sequences were found.")
            return {}

        ## phmmer all lpep files
        homologlist = hmmer_search(fasta_seq,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   dbpaths=dbpaths,
                                   temp_dir=temp_dir,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=None,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

    return homologlist
Example #5
0
    if args.stats and args.fasta:
        fasta_stats(args.fasta, args.display_on)

    #############################################################################
    ### extract sequence information from each gene in supplied transdecoder file
    transcript_dic = {}
    gene_families = {}
    seq_families = {}
    geneid_idx = {}
    gf_idx = {}

    full_blast = get_full_blast_idx(args.blast)
    verbalise("Y",
              "Created full blast index with %d entries" % (len(full_blast)))

    for defline, seq in internal.parsefasta(args.transdecoder):
        # get trinity and transdecoder gene ids:
        tdid, trinityid = parse_defline(defline)

        # get any blast results:
        if tdid in full_blast:
            blastline = full_blast[tdid]
        else:
            blastline = None

        # create new transcript instance
        newtranscript = Transcript(trinity_id=trinityid,
                                   transdecoder_id=tdid,
                                   blastline=blastline,
                                   seq=seq)
Example #6
0
def get_similar_sequences(temp_dir, buildhmmer=False, fastafile=None,
                        specieslist={}, species=None, genes=[], dbpaths={},
                        mincollect=2, globalthresh=0.2, localthresh=0.8,
                        verbalise=lambda *a: None):
    """Find homologs of the given genes and/or fasta records with hmmer.

    With more than one input sequence (or buildhmmer=True) the inputs are
    written to temp_dir/hmminput.fa, aligned by mafft_align(), converted to
    a model by the external `hmmbuild` command, and hmmer_search() is called
    with hmmfile set to that model. With a single input, the last record
    yielded by internal.get_gene_fastas() is given to hmmer_search() as the
    query and hmmfile is None.

    temp_dir     -- directory for the working files (query_fasta,
                    hmminput.fa, mafft_align_input.fa, hmmmodel.fa).
    buildhmmer   -- force the model-building route.
    fastafile    -- if truthy, copied record by record to
                    temp_dir/query_fasta, which is then forwarded as
                    extra_file_search.
    specieslist, dbpaths -- forwarded to internal.get_gene_fastas() and
                    hmmer_search().
    species      -- forwarded as query_species (see NOTE in the body).
    genes        -- list of gene names (a non-list value is wrapped in a
                    list); empty strings are discarded.
    mincollect, globalthresh -- forwarded to hmmer_search().
    localthresh  -- forwarded to hmmer_search() as minthresh.
    verbalise    -- callable used as verbalise(code, message); the codes
                    used here are "Y", "B", "R" and "C".

    Returns the result of hmmer_search(), or {} if no sequence was found.
    """
    # clean gene list type and content:
    if not isinstance(genes, list):
        genes = [genes]
    genes = [g for g in genes if g != '']

    # count genes provided:
    genelist_num, fasta_num = internal.count_genes(genes, fastafile)
    verbalise("Y", "Genelist size:%d\nFasta size:%d" % (genelist_num, fasta_num))

    # if fasta files are provided, create a temp fastafile to search against with hmmer:
    extra_file_search = None
    if fastafile:
        extra_file_search = os.path.join(temp_dir, "query_fasta")
        # context manager closes the file even if parsefasta raises
        with open(extra_file_search, 'w') as handle:
            for defline, seq in internal.parsefasta(fastafile):
                handle.write(">%s\n%s\n" % (defline, seq))

    if genelist_num + fasta_num > 1:
        buildhmmer = True

    if buildhmmer:
        hmminput = os.path.join(temp_dir, "hmminput.fa")
        seqcount = 0
        with open(hmminput, 'w') as handle:
            verbalise("B", "Extracting sequence data from %d peptides" % len(genes))

            # NOTE(review): `species` (a parameter) is rebound by this loop,
            # so the query_species passed below is that of the last record
            # yielded -- confirm this is the intended behaviour.
            for defline, seq, species in internal.get_gene_fastas(genes=genes,
                                                        species=None,
                                                        fastafile=fastafile,
                                                        specieslist=specieslist,
                                                        dbpaths=dbpaths):
                if seq:
                    seqcount += 1
                    handle.write("%s\n%s\n" % (defline, seq))

        if seqcount == 0:
            verbalise("R", "No gene sequences were found.")
            return {}

        # create alignment of input sequences:
        mafft_align1 = os.path.join(temp_dir, "mafft_align_input.fa")
        mafft_align(hmminput, mafft_align1)

        try:
            verbalise("B", "Creating hidden markov model from %d sequences" % seqcount)
            # create hmmbuild model of alignment:
            hmmmodel = os.path.join(temp_dir, "hmmmodel.fa")
            # touch the model file so it exists before hmmbuild is launched
            open(hmmmodel, 'a').close()
            handle = os.popen(" ".join(['hmmbuild --informat afa', hmmmodel, mafft_align1]))
            handle.close()

            homologlist = hmmer_search(None,
                                        specieslist,
                                        query_species=species,
                                        minthresh=localthresh,
                                        temp_dir=temp_dir,
                                        dbpaths=dbpaths,
                                        mincollect=mincollect,
                                        globalthresh=globalthresh,
                                        hmmfile=hmmmodel,
                                        verbalise=verbalise,
                                        extra_file_search=extra_file_search)
        finally:
            # always clear the intermediate files, also when the search fails
            os.remove(mafft_align1)
            os.remove(hmminput)

    else:
        # `genes` was already normalised to a list at the top of the function.
        verbalise("B", "Extracting sequence from %s" % genes)
        # run phmmer on a single input gene/sequence:
        seq = ""
        for defline, seq, species in internal.get_gene_fastas(genes=genes,
                                                    species=species,
                                                    fastafile=fastafile,
                                                    specieslist=specieslist,
                                                    dbpaths=dbpaths):
            fasta_seq = "%s\n%s\n" % (defline, seq)
            verbalise("C", fasta_seq)

        # `seq` stays "" if nothing was yielded; fasta_seq is unbound then,
        # so this guard must come before the search call.
        if not seq:
            verbalise("R", "No genes sequences were found.")
            return {}

        ## phmmer all lpep files
        homologlist = hmmer_search(fasta_seq,
                                    specieslist,
                                    query_species=species,
                                    minthresh=localthresh,
                                    dbpaths=dbpaths,
                                    temp_dir=temp_dir,
                                    mincollect=mincollect,
                                    globalthresh=globalthresh,
                                    hmmfile=None,
                                    verbalise=verbalise,
                                    extra_file_search=extra_file_search)

    return homologlist
Example #7
0
        verbalise=verbalise)
    verbalise(
        "C", "Best hmmer score = %d" % max(v[1] for v in homologlist.values()))

    ######### Extract identified sequences from LNRP fasta files #########
    conv_handle = open(logfile[:-3] + 'name_conversion.txt', 'w')
    conv_dic = {}
    itercount = 0
    previousseq = ""
    seqdic = {}  # loaded up to remove duplicate sequences
    excluded_genes = config.make_a_list(args.exclude_genes)
    excluded_species = config.make_a_list(args.exclude_species)

    # place any sequences provided in the input into the seqdic
    if args.fasta:
        for defline, seq in internal.parsefasta(args.fasta):
            if sequence_filter(seq, args.maxlength, args.minlength):
                continue
            else:
                seqdic[seq] = defline

    for homolog in sorted(homologlist):
        # remove excluded genes before bothering to look up their sequence:
        searchname = internal.fix_leaky_pipes(homolog)
        if searchname in excluded_genes:
            continue
        if homologlist[homolog][0] in excluded_species:
            continue

        # extract sequences of remaining genes and add to conversion dictionary
        itercount += 1
Example #8
0
        verbalise=verbalise,
    )
    verbalise("C", "Best hmmer score = %d" % max(v[1] for v in homologlist.values()))

    ######### Extract identified sequences from LNRP fasta files #########
    conv_handle = open(logfile[:-3] + "name_conversion.txt", "w")
    conv_dic = {}
    itercount = 0
    previousseq = ""
    seqdic = {}  # loaded up to remove duplicate sequences
    excluded_genes = config.make_a_list(args.exclude_genes)
    excluded_species = config.make_a_list(args.exclude_species)

    # place any sequences provided in the input into the seqdic
    if args.fasta:
        for defline, seq in internal.parsefasta(args.fasta):
            if sequence_filter(seq, args.maxlength, args.minlength):
                continue
            else:
                seqdic[seq] = defline

    for homolog in sorted(homologlist):
        # remove excluded genes before bothering to look up their sequence:
        searchname = internal.fix_leaky_pipes(homolog)
        if searchname in excluded_genes:
            continue
        if homologlist[homolog][0] in excluded_species:
            continue

        # extract sequences of remaining genes and add to conversion dictionary
        itercount += 1