Example No. 1
def get_sw_scores():
    res = {}
    with open('data/scores.sw') as f:
        for line in f:
            line = line.strip()
            if line.startswith('query:'):
                cur_prot = line[6:]
                res[cur_prot] = {}
            elif line.startswith('score: '):
                it = line[7:].split(' -- ')
                res[cur_prot][it[1]] = int(it[0])
    p = list(res.keys())
    s = list()
    for i in range(len(p)):
        scores = np.zeros((len(p), ), dtype=np.float32)
        for j in range(len(p)):
            norm = max(res[p[i]][p[i]], res[p[j]][p[j]])
            scores[j] = res[p[i]][p[j]] / norm
        s.append(scores)

    df = pd.DataFrame({'proteins': p, 'scores': s})

    prots, sequences = read_fasta(open('data/swissprot.fasta', 'r'))
    prots_dict = {}
    for prot, seq in zip(prots, sequences):
        prots_dict[prot] = seq

    sequences = list()
    for prot in p:
        sequences.append(prots_dict[prot])
    df['sequences'] = sequences

    df.to_pickle('data/sw_scores.pkl')
            
    return res
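The parser above assumes a plain-text Smith-Waterman score file with "query:" and "score:" lines. A minimal, self-contained sketch of that format and of the normalisation step, using hypothetical IDs and values inferred from the parsing code:

# Hypothetical layout of 'data/scores.sw': a "query:<id>" line opens a block,
# followed by "score: <value> -- <target id>" lines.
sample = [
    "query:P00001",
    "score: 250 -- P00001",
    "score: 80 -- P00002",
    "query:P00002",
    "score: 80 -- P00001",
    "score: 310 -- P00002",
]
res = {}
for line in sample:
    if line.startswith('query:'):
        cur = line[6:]
        res[cur] = {}
    elif line.startswith('score: '):
        value, target = line[7:].split(' -- ')
        res[cur][target] = int(value)
# Normalisation as in get_sw_scores: score(i, j) / max(self_score(i), self_score(j)).
print(res['P00001']['P00002'] / max(res['P00001']['P00001'], res['P00002']['P00002']))  # 0.258...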
Example No. 2
def pick_long_reads(files, length, outputfile, args):
    totalcount = 0
    passedcount = 0

    for fastafile in files:
        for header, seq in read_fasta(fastafile, False):
            totalcount = totalcount + 1
            if len(seq) >= length:
                longread = True
                if args.cut_stars:
                    seq = seq.split('*')
                    tmp_length = 0
                    j = 0
                    for i in range(0, len(seq)):
                        if len(seq[i]) > tmp_length:
                            tmp_length = len(seq[i])
                            j = i
                    if tmp_length < length:
                        print(header, tmp_length)
                        longread = False
                    seq = seq[j]
                if longread:
                    passedcount = passedcount + 1
                    if header.startswith("Contig"):
                        filename = (fastafile.split("/")[-1]).split(".")[0]
                        header = filename + "_" + header
                    outputfile.write('>%s\n%s\n' % (header, seq))

    outputfile.close()
    print('searched %s sequences.\n'
          '%s of them were longer than %s' % (totalcount, passedcount, length))
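The cut_stars branch above keeps the longest stretch between '*' stop symbols. A compact, self-contained sketch of the same idea on a toy sequence (the helper name is hypothetical):

def longest_star_free_chunk(seq):
    # '*' marks a stop; keep the longest fragment between stops.
    return max(seq.split('*'), key=len)

assert longest_star_free_chunk("MAAA*MKKKKKK*MG") == "MKKKKKK"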
Example No. 3
def build_index(args):
    i_file = args.input

    utils.logging("[INFO] Start downloading reference file.", args)
    tempbase = utils.gen_file()
    utils.mkdir(tempbase)
    reffile = os.path.join(tempbase, "raw.fa")
    utils.read_hdfs(i_file, reffile)

    tempfiles = [
        open(os.path.join(tempbase, "%s.fa" % m), 'w') for m in conv_way
    ]

    utils.logging("[INFO] Start transforming reference file.", args)
    # read ref
    for chrid, seq in utils.read_fasta(reffile):
        for i, method in enumerate(conv_way):
            (strand, a_from, a_to) = (method[0], method[2], method[4])

            if strand == "W":
                tempfiles[i].write(
                    ">%s\n%s\n" %
                    (chrid,
                     seq.translate(utils.make_trans_with(strand, a_from,
                                                         a_to))))
            else:
                tempfiles[i].write(
                    ">%s\n%s\n" %
                    (chrid,
                     seq.translate(utils.make_trans_with(strand, a_from,
                                                         a_to))[::-1]))

    # close all files
    for i, method in enumerate(conv_way):
        tempfiles[i].close()

    utils.logging("[INFO] Start launching bowtie2-build.", args)
    # run bowtie jobs
    procs = []

    utils.mkdir(os.path.join(tempbase, "index"))
    for i, method in enumerate(conv_way):
        out_pref = os.path.join(tempbase, "index", method)
        build_log = out_pref + ".build.log"

        proc = Process(target=call_bowtie,
                       args=(
                           tempfiles[i].name,
                           out_pref,
                           build_log,
                       ))
        procs.append(proc)
        proc.start()

    for proc in procs:
        proc.join()

    utils.logging("[INFO] Start uploading index file.", args)
    # move to hdfs
    utils.copy_to_hdfs(tempbase, args.output, remove_original=True)
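A minimal sketch of the base conversions the loop above appears to apply (bisulfite-style C-to-T on the Watson strand, G-to-A plus reversal for the Crick-strand index). conv_way and utils.make_trans_with are project helpers not shown here, so this only mirrors their apparent effect:

# Assumed effect of make_trans_with: a str.maketrans table for one base substitution.
C2T = str.maketrans("C", "T")
G2A = str.maketrans("G", "A")

seq = "ACGTACGT"
watson_c2t = seq.translate(C2T)        # "ATGTATGT"
crick_g2a = seq.translate(G2A)[::-1]   # "TACATACA", reversed as in the else branch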
Example No. 4
def align(sc, args):
    import utils as g_utils
    import align_utils as a_utils

    ## broadcast raw reference
    ref_file = os.path.join(args.tempbase, "ref.fa")
    g_utils.read_hdfs(os.path.join(args.ref, "raw.fa"), ref_file)
    ref_dict = {}
    for chrid, seq in g_utils.read_fasta(ref_file):
        ref_dict[chrid] = (seq, len(seq))

    g_utils.logging("[DEBUG] loading reference done", args)
    bc_refdict = sc.broadcast(ref_dict)

    ## read from hadoop
    readRDD = sc.textFile( args.input ) \
                .map( lambda x: g_utils.line2kv( x))

    if args.testmode == "balancing":
        readRDD = readRDD.partitionBy(args.nodes)

    readRDD = readRDD.cache()

    ## transform and get result of bowtie
    c2tTransRDD = readRDD.mapValues(lambda x: (x[0].translate(
        g_utils.make_trans_with("W", "C", "T")), x[1]))
    c2tMapRDD = c2tTransRDD.mapPartitionsWithIndex(
        lambda i, ptn: a_utils.mapping(i, "C2T", ["W_C2T", "C_C2T"], ptn, args
                                       ))

    g2aTransRDD = readRDD.mapValues(lambda x: (x[0].translate(
        g_utils.make_trans_with("W", "G", "A")), x[1]))
    g2aMapRDD = g2aTransRDD.mapPartitionsWithIndex(
        lambda i, ptn: a_utils.mapping(i, "G2A", ["W_G2A", "C_G2A"], ptn, args
                                       ))

    mergedRDD = sc.union([readRDD, c2tMapRDD, g2aMapRDD])
    combRDD = mergedRDD.combineByKey( lambda v: [v],\
                                      lambda lst, v: lst + [v],\
                                      lambda l1, l2: l1 + l2 )
    filteredRDD = combRDD.mapValues(lambda x: a_utils.select_and_find_uniq_alignment(x)) \
                         .filter(lambda kv: kv[1] is not None)

    if args.testmode == "balancing":
        filteredRDD = filteredRDD.partitionBy(args.nodes)


    methylRDD = filteredRDD.map( lambda x: a_utils.calc_methyl(x, bc_refdict.value, args.num_mm) )\
                            .filter( lambda x: x is not None )

    result_path = os.path.join(args.output, "alignment")
    methylRDD.map(lambda x: a_utils.res_to_string(x)).saveAsTextFile(
        result_path)

    return result_path
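The combineByKey call above simply groups every value sharing a read key into one list. A pure-Python sketch of that grouping (Spark partitioning and the broadcast variable are omitted):

def group_by_key(pairs):
    # Same merge logic as the three combineByKey lambdas: start a list, append, concatenate.
    grouped = {}
    for key, value in pairs:
        grouped.setdefault(key, []).append(value)
    return grouped

print(group_by_key([("read1", "raw"), ("read1", "C2T hit"), ("read2", "raw")]))
# {'read1': ['raw', 'C2T hit'], 'read2': ['raw']}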
Example No. 5
def retrieve_orfs(orfs,fastaFile,orfFile):
    orfFile = open(orfFile,'w')
    for header,seq in read_fasta(fastaFile):
        header = header.split()[0]
        if header in orfs:
            if orfs[header][2] == '+':
                orfFile.write('>%s\n%s\n' %(header,seq[orfs[header][0]-1:orfs[header][1]]))
            else:
                rev_seq = reverse_complement(seq[orfs[header][0]-1:orfs[header][1]])
                orfFile.write('>%s\n%s\n' %(header,rev_seq))
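From the slicing above, orfs appears to map each header to (start, end, strand) with a 1-based start and an inclusive end. A toy sketch of that convention (coordinates and sequence are hypothetical):

orfs = {"contig_1": (3, 11, '+')}
seq = "AAATGGTGTGATT"
start, end, strand = orfs["contig_1"]
orf = seq[start - 1:end]      # same slice as in retrieve_orfs
assert orf == "ATGGTGTGA"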
Example No. 6
def tally_gc_counts(fasta_dict=None):
    # Avoid evaluating read_fasta() at definition time; load lazily instead.
    if fasta_dict is None:
        fasta_dict = read_fasta()
    results_dict = dict()
    for k in sorted(fasta_dict.keys()):
        counter = collections.Counter(fasta_dict[k])
        total = sum(counter.values())
        gc_count = counter['G'] + counter['C']

        gc_percent = 100.0 * gc_count / total
        results_dict[k] = gc_percent
    # Return the (id, gc_percent) pair with the highest GC content.
    return max(results_dict.items(), key=lambda item: item[1])
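A quick check of the GC computation above on a toy record; the id-to-sequence dict returned by read_fasta() is assumed from the usage:

import collections

counter = collections.Counter("AGCTATAG")
gc_percent = 100.0 * (counter['G'] + counter['C']) / sum(counter.values())
assert gc_percent == 37.5   # 3 of the 8 bases are G or C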
Example No. 7
def run(args):
    fasta = read_fasta(args.fasta, args.logfile)
    gtf = read_gtf(args.gtf, args.logfile)

    out("Random Indexing Test:", args.logfile)
    for i, (ID, sequence) in enumerate(fasta.items()):

        print(ID)
        if "|" in ID:
            ID = ID.split("|")[3]
        gtf_items = gtf[ID]
        if not gtf_items:
            ID = ".".join(ID.split(".")[:-1])
            gtf_items = gtf[ID]
        print(ID)

        gtf_items = gtf[ID]
        exons = [(start, end) for _, start, end, _ in gtf_items]
        print(gtf_items)
        strand = gtf_items[0][3]
        chrom = gtf_items[0][0]

        out("Transcript ID: {}".format(ID), args.logfile)
        out(
            "len: {:4d} sequence[:20]: {}".format(len(sequence),
                                                  sequence[:20]), args.logfile)
        out("gtf_items: {}".format(gtf_items), args.logfile)

        # DEBUG: Checking that the exon lengths sum up to the length of the sequence
        total_length = 0
        for _, start, stop, _ in gtf_items:
            assert stop > start
            total_length += stop - start
        assert total_length == len(sequence), "{} != {}\n{}\n{}".format(
            total_length, len(sequence), ID, gtf_items)
        assert strand == "+" or strand == "-", "{}".format(strand)

        out("exons: {}".format(exons), args.logfile)
        for _ in range(10):
            queried_index = np.random.randint(0, len(sequence))
            genomic_index = find_location(queried_index, exons, strand,
                                          sequence)
            out(
                "Queried Index: {}\tGenomic index: {}".format(
                    queried_index, genomic_index), args.logfile)
            out(
                "Transcript window around queried index: {}".format(
                    sequence[queried_index - 10:queried_index + 11]),
                args.logfile)

        out("", args.logfile)
        if i > 20:
            exit()

    out("", args.logfile)
Example No. 8
def train(args):
    '''
    This is where the model is created & trained.
    '''
    # Loading the protein fasta sequences
    orig_seqs = []
    with open(args.input_file, 'r') as infasta:
        for _, seq in utils.read_fasta(infasta):
            orig_seqs.append(seq)

    # Prepare training data
    data_x, data_y, vocab_size, vocab_decode = \
        utils.load_data(orig_seqs, args.seq_length)

    # Creating the RNN-LSTM model
    rnn_model = Sequential()
    # Add first layer with input shape provided
    rnn_model.add(
        LSTM(args.hidden_dim,
             input_shape=(None, vocab_size),
             return_sequences=True))
    # Add the remaining layers
    for i in range(args.num_layers - 1):
        rnn_model.add(LSTM(args.hidden_dim, return_sequences=True))
    # We treat each char in the vocabulary as an independent time step
    # and use TimeDistributed wrapper to apply the same dense layer (with
    # number of units = vocab_size) and same weights to output one
    # time step from the sequence for each time step in the input.
    # In other words, we will process all the time steps of the input
    # sequence at the same time.
    rnn_model.add(TimeDistributed(Dense(vocab_size)))
    rnn_model.add(Activation('softmax'))
    rnn_model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

    # Train the model
    rnn_model.fit(data_x,
                  data_y,
                  batch_size=args.batch_size,
                  verbose=1,
                  epochs=args.train_epochs)

    # Generate new sequences
    proteins = []
    for i in range(args.generate_epochs):
        new_pep = utils.generate_seq(rnn_model, args.seq_length, vocab_size,
                                     vocab_decode)
        proteins.append(new_pep)

    # Write new protein sequences to a fasta file
    utils.write_fasta(proteins, args.output_file)
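The input_shape=(None, vocab_size) above implies one-hot encoded inputs of shape (batch, time_steps, vocab_size). A small sketch of that encoding for a toy peptide; the 20-letter alphabet and the exact behaviour of utils.load_data are assumptions:

import numpy as np

vocab = sorted("ACDEFGHIKLMNPQRSTVWY")   # assumed amino-acid alphabet
vocab_size = len(vocab)                  # 20

peptide = "MKT"
x = np.zeros((1, len(peptide), vocab_size), dtype=np.float32)
for t, aa in enumerate(peptide):
    x[0, t, vocab.index(aa)] = 1.0       # one-hot per time step
# x has shape (1, 3, 20): one sequence, three time steps, one-hot over the vocabulary.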
Example No. 9
def load_fasta_sequences(fasta_file1, fasta_file2, fasta1_to_fasta2_blast):
    fasta_sequences = {
        ''.join(os.path.split(filename)[-1].split('.')[:-1]):
        [[headerStr, seq] for headerStr, seq in read_fasta(filename)]
        for filename in [fasta_file1, fasta_file2]
    }
    blast_results = load_blast_results(fasta1_to_fasta2_blast, [0, 1, 2])
    fasta_sources = list(fasta_sequences.keys())
    output_name = '-'.join(
        ['targets', f'{fasta_sources[1].split("_")[-1]}', 'blast'])
    cols = [v[0] for v in fasta_sequences[fasta_sources[0]]]
    rows = [v[0] for v in fasta_sequences[fasta_sources[1]]]
    values = pd.DataFrame(0.0, index=rows, columns=cols)
    for val in blast_results:
        values.at[val[1], val[0]] = val[2]
    values.to_csv(output_name + '.csv')
Example No. 10
def main():
    parser = argparse.ArgumentParser(
        description='Adopt pairwise aligner to generate identity matrix')
    parser.add_argument('fasta_path',
                        type=str,
                        help='the path of fasta files to parse')
    parser.add_argument('out_path', type=str, help='output dir')
    args = parser.parse_args([
        '/home/ZwZ/database/M/linsi/M1_M2_unique.fasta',
        '/home/ZwZ/database/M/linsi/M1_M2_unique.npy'
    ])

    fasta = read_fasta(args.fasta_path)

    identity_M = identity(fasta)

    np.save(args.out_path, identity_M)
Example No. 11
def bx_multiplicity(rfile):
    """
    Calculate barcode multiplicity given a read file name. Returns a dictionary
    of barcode -> num reads with that barcode.
    """
    bxs = {}
    if rfile == "-":
        rfile = "/dev/stdin"
    with open(rfile) as reads:
        if not reads.isatty():
            for _, _, bx, _ in read_fasta(reads):
                if bx is not None:
                    bxs.setdefault(bx, 0)
                    bxs[bx] += 1
        else:
            raise RuntimeError(
                "Reads must be piped from stdin if file name is not provided")
    return bxs
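The docstring above describes a barcode-to-read-count mapping; the same tally can be written with collections.Counter. Here records stands in for whatever read_fasta yields, assumed to be 4-tuples whose third element is the barcode:

from collections import Counter

def bx_multiplicity_from_records(records):
    # Count reads per barcode, skipping records without a barcode.
    return Counter(bx for _, _, bx, _ in records if bx is not None)

print(bx_multiplicity_from_records([
    ("r1", "ACGT", "BX1", None),
    ("r2", "TTGA", "BX1", None),
    ("r3", "GGCC", None, None),
]))
# Counter({'BX1': 2})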
Example No. 12
    def insert_peptide_sequences_into_db(self, peptide_fasta):
        """
        Insert peptide sequences into DB using peptide FASTA file.
        NOTE: Risk for high memory consumption as entire FASTA is kept in memory.
        """
        sequences = {
            header.split()[0]: sequence
            for header, sequence in read_fasta(peptide_fasta,
                                               keep_formatting=False)
        }

        discriminative_peptides = self.db.execute(
            "SELECT peptide FROM peptides").fetchall()
        for peptide in discriminative_peptides:
            self.db.execute(
                "UPDATE peptides SET sequence = ? WHERE peptide = ?",
                (sequences[peptide[0]], peptide[0]))
            #logging.debug("Added sequence %s for peptide %s", sequences[peptide[0]], peptide[0]) # TODO: verbose
        self.db.commit()
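A self-contained sketch of the UPDATE pattern above against an in-memory SQLite database; the table layout and data are hypothetical, and the real class wraps its own self.db connection:

import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE peptides (peptide TEXT PRIMARY KEY, sequence TEXT)")
db.execute("INSERT INTO peptides (peptide) VALUES ('pep_001')")

sequences = {"pep_001": "MKTAYIAKQR"}   # header -> sequence, as parsed above
for (peptide,) in db.execute("SELECT peptide FROM peptides").fetchall():
    db.execute("UPDATE peptides SET sequence = ? WHERE peptide = ?",
               (sequences[peptide], peptide))
db.commit()
print(db.execute("SELECT * FROM peptides").fetchall())   # [('pep_001', 'MKTAYIAKQR')]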
Example No. 13
def main():
    parser = argparse.ArgumentParser(
        description='filter out identical sequences and write a clean fasta file')
    parser.add_argument(
        'in_path',
        type=str,
        help='please give an absolute path of the input fasta file',
        default='/home/ZwZ/database/M/linsi/M1_M2_all.fasta')
    parser.add_argument('out_path',
                        type=str,
                        help='the absolute path of filtered fasta file',
                        default='/home/ZwZ/database/M/linsi/filter.fasta')
    args = parser.parse_args()

    ALL = utils.read_fasta(args.in_path, True)

    keep, discard, _ = search_identical(ALL)  # the most important func
    kept = [ALL[i] for i in keep]

    SeqIO.write(kept, args.out_path, 'fasta')
Example No. 14
def load_data():
    ngram_df = pd.read_pickle('data/ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))

    ngrams = list()
    proteins = list()
    f = open('data/swissprot.fasta')
    prots, seqs = read_fasta(f.readlines())
    for protein, seq in zip(prots, seqs):
        if not is_ok(seq) or len(seq) - gram_len + 1 > MAXLEN:
            continue
        proteins.append(protein)
        grams = list()
        for i in range(len(seq) - gram_len + 1):
            grams.append(vocab[seq[i: (i + gram_len)]])
        ngrams.append(grams)
        
    df = pd.DataFrame({
        'proteins': proteins,
        'ngrams': ngrams,
    })

    def get_values(df):
        grows = []
        gcols = []
        gdata = []
        for i, row in enumerate(df.itertuples()):
            for j in range(len(row.ngrams)):
                grows.append(i)
                gcols.append(j)
                gdata.append(row.ngrams[j])
        data = sparse.csr_matrix((gdata, (grows, gcols)), shape=(len(df), MAXLEN))
        return data

    return proteins, get_values(df)
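A toy illustration of the n-gram windowing above (3-grams over a short peptide; the vocabulary lookup is left out):

seq = "MKTAYI"
gram_len = 3
grams = [seq[i:i + gram_len] for i in range(len(seq) - gram_len + 1)]
assert grams == ['MKT', 'KTA', 'TAY', 'AYI']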
Example No. 15
def main():
    min_len = 200
    max_len = 370
    # parameters used for filtering
    mis = 0
    # read primers
    fpri, rpri = utils.read_trad_primer(sys.argv[1])
    lfile = sys.argv[2]
    num = 0
    seq = utils.read_fasta(sys.argv[2])
    for item in seq.items():
        se = item[1]
        for f in fpri:
            for i in range(0, len(se)-len(f)):
                str1 = se[i:len(f)+i]
                #print str1, f
                #print mismatch(str1,f,mis)
                if mismatch(str1,f,mis):
                    #print "match"
                    num += 1
                    output = ">F:"+lfile+"_"+str(num)+"\n"+f+'\n'
                    #print output
                    #print ">F:",f, i
                    for r in rpri:
                        rp = r
                        for j in range(0, len(se)-len(rp)):
                            str2 = se[j:len(rp)+j]
                            if mismatch(str2,rp,mis):
                                #print "match"
                                frp = reverse_complement.get_rc(rp)
                                #print ">R:",frp,j+len(rp), j+len(rp)-i
                                flen = j+len(rp)-i
                                #print flen
                                if min_len < flen < max_len:
                                    output = output + ">R:" + lfile + "_" + str(num) + "\n" + frp + '\n'
                                    print(output)
Example No. 16
 def _read(self, f):
     return utils.read_fasta(f)
Example No. 17
parser.add_argument("--step", "-s", default=15, help="the length between each two fragments")
parser.add_argument("--flen", "-fl", default=53, help="the length for each fragment")
parser.add_argument("--prob", "-p", default=0.5, help="the probability for selecting a target site")
parser.add_argument("--numsite", "-ns", default=0, help="the number of target sites")

args = parser.parse_args()

fasta_name = args.input1
onemirna = args.input2
outf = args.output
step = args.step
fraglen = args.flen
prob = args.prob
nums = args.numsite

headers, mrnaseqs = read_fasta(fasta_name)

# prepare the input data
mirnaf = open(onemirna, 'r')
mirnaseq = mirnaf.read().rstrip('\r\n').upper()

print(mirnaseq)
remirna = "".join(reversed(mirnaseq))
mirnaseq = remirna
print(remirna) 

maxlen = 79
maxlenmir = 26
mirnalen = len(mirnaseq)

if mirnalen < maxlenmir:
Example No. 18
import numpy as np
import random

#read in sites
posfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/rap1-lieb-positives.txt'
negfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/yeast-upstream-1k-negative.fa'
testfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/rap1-lieb-test.txt'
poslist = util.read_pos(posfile)
finaltestlist = util.read_pos(testfile)
posreversecomp = []
for i in poslist:
    posreversecomp.append(util.reverse_complement(i))

poslist = poslist + posreversecomp

neglist = util.read_fasta(negfile)
negreversecomp = []
for i in neglist:
    negreversecomp.append(util.reverse_complement(i))
neglist = neglist + negreversecomp

# Removing items from a list while iterating over it skips elements;
# filter into a new list against a precomputed set instead.
posset = set(poslist)
neglist = [i for i in neglist if i not in posset]

#print('negs',neglist[:10])
print('neg', len(neglist))
print('pos', len(poslist))
shortneg = []
for i in neglist:
    #TODO adapt so this is a random slice of the negative
Example No. 19
 def _read(self, f):
     s, t = utils.read_fasta(f, dna_only=True)
     return s, t
Example No. 20
 def load_sequences(self) -> list:
     self.initial_alignment_df = read_fasta(self.file_name)
     return [sequence for sequence in self.initial_alignment_df.get("sequence")]
Example No. 21
parser.add_argument("--numsite",
                    "-ns",
                    default=0,
                    help="the number of called target sites")

args = parser.parse_args()

fasta_mrna = args.input1
fasta_mirna = args.input2
outf = args.output
step = args.step
fraglen = args.flen
prob = args.prob
nums = args.numsite

hmrna, mrnaseqs = read_fasta(fasta_mrna)
hmirna, mirnaseqs = read_fasta(fasta_mirna)

# prepare the input data
remirnas = []
for one in mirnaseqs:
    oneremirna = "".join(reversed(one))
    remirnas.append(oneremirna.upper())

print(mirnaseqs[0])
print(remirnas[0])
mirnaseqs = remirnas

maxlen = 79
maxlenmir = 26
mirnalen = len(min(mirnaseqs, key=len))
Example No. 22
import numpy as np
from utils import read_fasta, estimate_population, generate_population

initial_alignment_df = read_fasta(filename='dataset/BB11001-m.fa')

# initial_alignment = [np.fromstring(sequence, dtype=np.uint8) for sequence in initial_alignment_df.get("sequence")]
initial_alignment = [
    sequence for sequence in initial_alignment_df.get("sequence")
]

# initial_alignment = np.array(initial_alignment)

population = generate_population(size=20, alignment=initial_alignment)
print(population)
print(estimate_population(population))
#
# print(f'Estimated sequence value: {estimated_value}')
Example No. 23
def main(args):
    # Mapping COG id to proteins
    cog_prot = defaultdict(lambda: defaultdict(list))
    # Mapping protein to COG ids
    prot_cog = defaultdict(lambda: defaultdict(list))

    if not os.path.exists(args.dest):
        print >> sys.stderr, 'Creating directory %s' % args.dest
        os.makedirs(args.dest)
    if not os.path.isdir(args.dest): sys.exit('Destination is not a directory')

    if args.cog_csv:
        cog_csv = (l.strip('\n').split(',') for l in args.cog_csv)
    else:
        if not os.path.exists('%s/cog2003-2014.csv' % args.dest):
            print >> sys.stderr, 'Downloading COG csv from NCBI'
            response = urllib2.urlopen(
                'ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/cog2003-2014.csv')
            with open('%s/cog2003-2014.csv' % args.dest, 'w') as outh:
                outh.write(response.read())
        cog_csv = (l.strip('\n').split(',')
                   for l in open('%s/cog2003-2014.csv' % args.dest, 'r'))

    for l in cog_csv:
        protid = l[2]
        cogid = l[6]
        spos = int(l[4])
        epos = int(l[5])
        cog_prot[cogid][protid] = merge_interval_list(cog_prot[cogid][protid] +
                                                      [(spos, epos)],
                                                      dist=1)
        prot_cog[protid][cogid] = merge_interval_list(prot_cog[protid][cogid] +
                                                      [(spos, epos)],
                                                      dist=1)

    print >> sys.stderr, "Found {:,} COGs".format(len(cog_prot))
    print >> sys.stderr, "Found {:,} protein ids".format(len(prot_cog))
    c = Counter(len(v) for k, v in prot_cog.iteritems())
    print >> sys.stderr, "Proteins belonging to multiple COGs:"
    print >> sys.stderr, 'Num COGs  | Count'
    for k in range(10):
        print >> sys.stderr, '%s%d' % (str(k + 1).ljust(12), c[k + 1])
    print >> sys.stderr, '%s%d' % ('11+'.ljust(12),
                                   sum(v for k, v in c.iteritems() if k > 10))

    if args.fasta:
        seqiter = (
            (seqname, seq)
            for seqname, seq in read_fasta(gzip.GzipFile(fileobj=args.fasta)))
    else:
        if not os.path.exists('%s/prot2003-2014.fa.gz' % args.dest):
            print >> sys.stderr, 'Downloading protein sequences from NCBI'
            response = urllib2.urlopen(
                'ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/prot2003-2014.fa.gz'
            )
            with open('%s/prot2003-2014.fa.gz' % args.dest, 'wb') as outh:
                outh.write(response.read())
        seqiter = ((seqname, seq) for seqname, seq in read_fasta(
            gzip.open('%s/prot2003-2014.fa.gz' % args.dest, 'rb')))

    # Clear out destination
    purge_msg = True
    for cogid in cog_prot.keys():
        if os.path.exists('%s/%s/%s.faa' % (args.dest, cogid[:5], cogid)):
            if purge_msg:
                print >> sys.stderr, 'Purging existing sequence files'
                purge_msg = False
            os.remove('%s/%s/%s.faa' % (args.dest, cogid[:5], cogid))

    # Create the COG files
    numseqs = 0
    filenames = {}
    seqcounts = Counter()
    for seqname, seq in seqiter:
        try:
            m = re.search(r'gi\|(\d+)\|ref', seqname)
            protid = m.group(1)
        except (AttributeError, ValueError):
            print >> sys.stderr, 'Error parsing sequence name: "%s"' % seqname
            continue
        numseqs += 1
        if not numseqs % 100000:
            print >> sys.stderr, 'Processed %d proteins...' % numseqs
        for cogid, ivs in prot_cog[protid].iteritems():
            for istart, iend in ivs:
                newseqname = 'cog|%s|%s (%d-%d)' % (cogid, seqname, istart,
                                                    iend)
                newseq = seq[istart - 1:iend]
                seqcounts[cogid] += 1
                if not os.path.exists('%s/%s' % (args.dest, cogid[:5])):
                    os.makedirs('%s/%s' % (args.dest, cogid[:5]))
                with open('%s/%s/%s.faa' % (args.dest, cogid[:5], cogid),
                          'a') as outh:
                    print >> outh, '>%s\n%s' % (newseqname, fmt_seq(newseq))
                filenames[cogid] = '%s/%s.faa' % (cogid[:5], cogid)

    print >> sys.stderr, "Processed %d total proteins" % numseqs
    with open('%s/fungroups.txt' % args.dest, 'w') as outh:
        for k in sorted(filenames.keys()):
            print >> outh, '%s\t%s\t%d' % (k, filenames[k], seqcounts[k])
Example No. 24
"""
the label of the next string.

In Rosalind's implementation, a string in FASTA format will be labeled by the ID "Rosalind_xxxx", where
"xxxx" denotes a four-digit code between 0000 and 9999.

Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).

Return: The ID of the string having the highest GC-content, followed by the GC-content of that string. Rosalind
allows for a default error of 0.001 in all decimal answers unless otherwise stated; please see the note on
absolute error below.
"""

from utils import read_fasta, gc_content


if __name__ == '__main__':
    sequences = read_fasta('data/rosalind_gc.txt')

    max_gc = 0
    max_id = None

    for id, s in sequences.items():
        gc = gc_content(s)

        if gc > max_gc:
            max_gc = gc
            max_id = id

    print(max_id)
    print(max_gc * 100)
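A minimal sketch of the gc_content helper this script assumes; it returns a fraction, which the final line scales to a percentage:

def gc_content(seq):
    seq = seq.upper()
    return (seq.count('G') + seq.count('C')) / float(len(seq))

assert gc_content("AGCTATAG") == 0.375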
Example No. 25
downstream_model_names = [
    '56_Linear.pth', '66_Linear.pth', '68_Linear.pth', '71_Linear.pth',
    '69_Linear.pth'
]
downstream_models = [
    torch.load(os.path.join(model_path, model_name), map_location=device)
    for model_name in downstream_model_names
]

pdb_file = 'C:\\Workspace\\PE\\UI\\saved_models\\1emm_mod.pdb'
atom_lines = utils.process_pdb(pdb_file, atoms_type=['N', 'CA', 'C', 'O'])

coord_array_ca, acid_array, coord_array = utils.extract_coord(
    atom_lines, atoms_type=['N', 'CA', 'C', 'O'])
utils.compare_len(coord_array, acid_array, ['N', 'CA', 'C', 'O'])
seq_dict = utils.read_fasta(seq_file)

with open(os.path.join(files_path, 'results.txt'), 'a') as f:
    f.write('sequence_name\tpredicted_value\n')

for seq_name, seq in seq_dict.items():
    seq_dict[seq_name] = utils.seq2array(seq)
    array = utils.get_knn_135(coord_array, seq)
    input_ = torch.tensor(array.reshape(-1, 135)).float()
    hidden = pretrained_model(input_).squeeze(1)

    output = 0
    for model in downstream_models:
        model.eval()
        model.is_training = False
        output += model(hidden)
Example No. 26
 def write_fasta(self):
     self.initial_alignment_df = read_fasta(self.file_name)
     with open("BB11001.aln", "w") as file_handler:
         for index, id in enumerate(self.initial_alignment_df.get("id")):
             file_handler.write(">" + id + "\n")
             file_handler.write(self.best_solution[index] + "\n")
Example No. 27
def parse_orfs(orfFinderOut,orfFile,minLength):
    orfOut = open(orfFile,'w')
    for header, seq in read_fasta(orfFinderOut,False):
        if len(seq) > minLength:
            seq = find_common_startcodon(seq,minLength)
            orfOut.write('>%s\n%s\n' %(header,seq))
Example No. 28
    parser.add_argument('--depth', default=7, help="min,max")
    parser.add_argument('--period', default=1, help="min, max")
    parser.add_argument('-k', default=1, help="k-fold validation")
    parser.add_argument('model', nargs='+')

    args = parser.parse_args()

    seqs = []
    labels = dict()
    all_labels = set()

    for f in args.model:
        label = ".".join(os.path.split(f)[-1].split(".")[:-1])
        all_labels.add(label)

        for seq in utils.read_fasta(f):
            seqs.append(seq)
            labels[seq["seqid"]] = label

    best_args = [12, 7, 1, 100]

    print "width\tdepth\tperiod\terror_rate"
    for w in get_range(args.window):
        for d in get_range(args.depth):
            if w < d:
                continue

            for p in get_range(args.period):
                errors = utils.cross_validate(seqs, labels, int(args.k), window=w, depth=d, period=p)
                avg_error = sum(errors) / len(errors)
Example No. 29
def main(go_file, train_sequences_file, train_annotations_file,
         test_sequences_file, test_annotations_file, out_terms_file,
         train_data_file, test_data_file, min_count):
    logging.info('Loading GO')
    go = Ontology(go_file, with_rels=True)

    logging.info('Loading training annotations')
    train_annots = {}
    with open(train_annotations_file, 'r') as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            if prot_id not in train_annots:
                train_annots[prot_id] = set()
            go_id = it[1]
            train_annots[prot_id].add(go_id)

    logging.info('Loading training sequences')
    info, seqs = read_fasta(train_sequences_file)
    proteins = []
    sequences = []
    annotations = []
    for prot_info, sequence in zip(info, seqs):
        prot_id = prot_info.split()[0]
        if prot_id in train_annots:
            proteins.append(prot_id)
            sequences.append(sequence)
            annotations.append(train_annots[prot_id])

    prop_annotations = []
    cnt = Counter()
    for annots in annotations:
        # Propagate annotations
        annots_set = set()
        for go_id in annots:
            annots_set |= go.get_anchestors(go_id)
        prop_annotations.append(annots_set)
        for go_id in annots_set:
            cnt[go_id] += 1

    df = pd.DataFrame({
        'proteins': proteins,
        'sequences': sequences,
        'annotations': prop_annotations,
    })
    logging.info(f'Train proteins: {len(df)}')
    logging.info(f'Saving training data to {train_data_file}')
    df.to_pickle(train_data_file)

    # Keep GO terms annotated at least min_count times
    res = {}
    for key, val in cnt.items():
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        terms += val

    logging.info(f'Number of terms {len(terms)}')
    logging.info(f'Saving terms to {out_terms_file}')

    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)

    logging.info('Loading testing annotations')
    test_annots = {}
    with open(test_annotations_file, 'r') as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            if prot_id not in test_annots:
                test_annots[prot_id] = set()
            go_id = it[1]
            test_annots[prot_id].add(go_id)

    logging.info('Loading testing sequences')
    info, seqs = read_fasta(test_sequences_file)
    proteins = []
    sequences = []
    annotations = []
    for prot_info, sequence in zip(info, seqs):
        prot_id = prot_info.split()[0]
        if prot_id in test_annots:
            proteins.append(prot_id)
            sequences.append(sequence)
            annotations.append(test_annots[prot_id])

    prop_annotations = []
    for annots in annotations:
        # Propagate annotations
        annots_set = set()
        for go_id in annots:
            annots_set |= go.get_anchestors(go_id)
        prop_annotations.append(annots_set)

    df = pd.DataFrame({
        'proteins': proteins,
        'sequences': sequences,
        'annotations': prop_annotations,
    })
    logging.info(f'Test proteins {len(df)}')
    logging.info(f'Saving testing data to {test_data_file}')
    df.to_pickle(test_data_file)
Example No. 30
def build_profile_matrix():
    seq_len = len(list(fasta.values())[0])
    # All sequences must have the same length.
    assert len(set(len(x) for x in fasta.values())) <= 1

    profile = collections.defaultdict(lambda: [0] * seq_len)
    for i in range(seq_len):
        for base in 'ACGT':
            for dna_seq in fasta.values():
                if dna_seq[i] == base:
                    profile[base][i] += 1

    return dict(profile)


def calculate_consensus():
    seq_len = len(list(fasta.values())[0])
    consensus = []

    profile = build_profile_matrix()
    for j in range(seq_len):
        max_count = 0, ''
        for base in profile:
            if profile[base][j] > max_count[0]:
                max_count = profile[base][j], base
        consensus.append(max_count)
    return ''.join([i[1] for i in consensus])


fasta = read_fasta()
print(calculate_consensus())
for base, profile in build_profile_matrix().items():
    print('%s: %s' % (base, ' '.join(str(i) for i in profile)))
Example No. 31
 def _read(self, f):
     dnas = [list(dna) for _, dna in utils.read_fasta(f)]
     return np.asarray(dnas)
Example No. 32
from sys import argv, exit
from utils import read_fasta

if len(argv) < 2:
    print("usage: script.py FASTA discriminative_peptides.txt")
    exit(1)

fastafile = argv[1]
discpeps = argv[2]

disc = {}
with open(discpeps) as f:
    for line in f:
        if line.startswith("-"):
            break
        peptide, families = line.strip().split(" ", maxsplit=1)
        translation = str.maketrans("{'}", "   ")
        families = families.translate(translation).split(",")
        disc[peptide] = [fam.strip() for fam in families]

disc_seqs = {}
for header, sequence in read_fasta(fastafile):
    try:
        disc_seqs[sequence] = ", ".join(disc[header])
    except KeyError:
        pass

for seq, target in disc_seqs.items():
    print(seq, target, sep="\t")
Example No. 33
def run(args):
    sequences = read_fasta(args.fasta,
                           args.logfile) if args.fasta is not None else None
    ref = None
    if args.gtf is not None:
        ref = read_gtf(args.gtf, args.logfile)
    elif args.bed is not None:
        ref = read_bed(args.bed, args.logfile)

    if ref is None:
        # Without a GTF or BED reference there is nothing to map against.
        raise NotImplementedError

    meRanCall = read_meRanCall(args.input, args.logfile)
    outfile = open(args.output, "w")
    errfile = open(args.error_out, "w")

    header = "# <chromosome> <genomic position> <strand> <methylation rate> <transcript ID> <transcript position>\n"
    outfile.write(header)

    sequence = None
    not_found_errors = 0
    not_found_error_transcripts = set()
    sequence_length_check_errors = 0
    sequence_length_check_error_transcripts = set()

    for ID, refPos, refStrand, methRate in meRanCall:
        if sequences is not None:
            sequence = sequences[ID]

        out("ID:{} position:{}".format(ID, refPos), args.logfile)
        if "|" in ID:
            ID = ID.split("|")[3]

        if ID not in ref:
            out("Error: Could not find ID {} in reference file".format(ID),
                args.logfile)
            errfile.write(
                "Error: Could not find ID {} in reference file\n".format(ID))
            not_found_errors += 1
            not_found_error_transcripts.add(ID)
            continue
        coordinates = ref[ID]
        assert refStrand == "+"

        if sequence is not None:
            total_length = 0
            for _, start, stop, _ in coordinates:
                total_length += stop - start
            if total_length != len(sequence):
                out(
                    "Error: The lengths of the exons do not match the overall length of the sequence: exons {} != seq {}"
                    .format(total_length, len(sequence)), args.logfile)

                if len(sequence) < total_length:
                    errfile.write(
                        "Error: The lengths of the exons of ID {} do not match the overall length of the sequence: exons {} != seq {}\n"
                        .format(ID, total_length, len(sequence)))
                    sequence_length_check_errors += 1
                    sequence_length_check_error_transcripts.add(ID)
                    continue

                out("Attempting to handle Poly-A...", args.logfile)
                length_diff = len(sequence) - total_length
                last_chunk = sequence[-length_diff:]
                if last_chunk == "A" * length_diff:
                    sequence = sequence[:-length_diff]
                    out("Handled this case by cutting off Poly-A",
                        args.logfile)
                    assert total_length == len(sequence)
                else:
                    out("Could not handle this case by cutting off Poly-A",
                        args.logfile)
                    errfile.write(
                        "Error: The lengths of the exons of ID {} do not match the overall length of the sequence: exons {} != seq {}\n"
                        .format(ID, total_length, len(sequence)))
                    sequence_length_check_errors += 1
                    sequence_length_check_error_transcripts.add(ID)
                    continue

        strand = coordinates[0][3]
        chrom = coordinates[0][0]
        genomic_position = find_location(refPos, coordinates)

        line = "\t".join(
            (chrom, str(genomic_position), strand, str(methRate), ID,
             str(refPos))) + "\n"
        outfile.write(line)

    out("Finished writing to: {}".format(args.output), args.logfile)
    out(
        "Number of sites whose transcript was not found in the reference file: {}"
        .format(not_found_errors), args.logfile)
    out(
        "Number of sites whose transcript sequence length did not equal the sum of all the exons: {}"
        .format(sequence_length_check_errors), args.logfile)
    out("Total meRanCall lines: {}".format(len(meRanCall)), args.logfile)
    out("", args.logfile)
    out(
        "Number of transcripts that were not found in the reference file: {}".
        format(len(not_found_error_transcripts)), args.logfile)
    out(
        "Number of transcripts whose length did not equal the sum of all the exons: {}"
        .format(len(sequence_length_check_error_transcripts)), args.logfile)
    out(
        "Total number of transcripts read in the reference file: {}".format(
            len(ref)), args.logfile)

    errfile.write(
        "Number of sites whose transcript was not found in the reference file: {}\n"
        .format(not_found_errors))
    errfile.write(
        "Number of sites whose transcript sequence length did not equal the sum of all the exons: {}\n"
        .format(sequence_length_check_errors))
    errfile.write("Total meRanCall lines: {}\n".format(len(meRanCall)))
    errfile.write("\n")
    errfile.write(
        "Number of transcripts that were not found in the reference file: {}\n"
        .format(len(not_found_error_transcripts)))
    errfile.write(
        "Number of transcripts whose length did not equal the sum of all the exons: {}\n"
        .format(len(sequence_length_check_error_transcripts)))
    errfile.write(
        "Total number of transcripts read in the reference file: {}\n".format(
            len(ref)))
Example No. 34
 def _read(self, f):
     return utils.read_fasta(f, dna_only=True)[0]
Example No. 35
        letter, _ = c.most_common(1)[0]
        consensus += letter
    return consensus, profile

def display(symbols, consensus, profile):
    s = consensus + "\n"
    for symb, counts in zip(symbols, profile):
        s = s + f"{symb}: {list_to_string(counts)}\n"
    return s

if __name__ == "__main__":

    symbols = "ACGT"

    dnas = [
        "ATCCAGCT",
        "GGGCAACT",
        "ATGGATCT",
        "AAGCAACC",
        "TTGGAACT",
        "ATGCCATT",
        "ATGGCACT",
    ]
    display(symbols, *profile(symbols, dnas))

    dnas = read_fasta("inps/rosalind_cons.txt")
    print(dnas)
    s = display(symbols, *profile(symbols, dnas))
    with open("outs/rosalind_cons.txt", "w") as f:
        f.write(s)