Example #1
def iterate_handle_threaded(handle, ref, database='data/gsaid.db'):
    """
    Function to iterate through handle passed by Chunky bot
    :param cursor: sqlite database handler
    :param handle: list containing tuples (header, raw seq)
    :param ref: file containing reference sequence, default-> NC_04552.fa
    """

    _, refseq = gotoh2.convert_fasta(open(ref))[0]

    in_queue = queue.Queue()
    out_queue = queue.Queue()

    # stream records into queue
    for h, s in handle:
        in_queue.put((h, s, database, refseq))

    # pairwise align sequences in queue
    for _ in range(in_queue.qsize()):
        AlignerThread(in_queue, out_queue).start()

    # block until queue is processed
    in_queue.join()

    # update database with aligned sequences
    cursor, conn = open_connection(database)
    while not out_queue.empty():
        header, seq, alignedseq = out_queue.get()
        insert_seq(cursor, seq, header, alignedseq)
        conn.commit()
        out_queue.task_done()
    conn.close()
    out_queue.join()
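
This threaded example hands each queued record to an AlignerThread worker that is defined elsewhere in the source module. The sketch below shows the pattern it depends on: each worker pops one record, aligns it, pushes the result onto out_queue, and calls task_done() so that in_queue.join() can return. The pairwise_align() helper and the constructor signature are assumptions, not the project's actual implementation.

import queue
import threading

class AlignerThread(threading.Thread):
    """Illustrative worker: pops one record from in_queue, aligns it against
    the reference, and pushes the result onto out_queue."""

    def __init__(self, in_queue, out_queue):
        super().__init__(daemon=True)
        self.in_queue = in_queue
        self.out_queue = out_queue

    def run(self):
        try:
            header, seq, database, refseq = self.in_queue.get_nowait()
        except queue.Empty:
            return
        # pairwise_align() stands in for whatever gotoh2-based helper the
        # project actually uses (e.g. find_seq); it is an assumption here
        alignedseq = pairwise_align(seq, refseq)
        self.out_queue.put((header, seq, alignedseq))
        # task_done() is what lets in_queue.join() return in the caller
        self.in_queue.task_done()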
Example #2
def iterate_fasta(fasta, ref, database = 'data/gsaid.db'):
	""" Function to iterate through fasta file
		:params:
			:cursor: sqlite database handler
			:fasta: file containing sequences
			:ref: file containing reference sequence, default-> NC_04552.fa
	"""
	handle = gotoh2.iter_fasta(open(fasta))
	_, refseq = gotoh2.convert_fasta(open(ref))[0]

	in_queue = queue.Queue()
	out_queue = queue.Queue()

	for h, s in handle:
		in_queue.put((h, s, database, refseq))
	for _ in range(in_queue.qsize()):
		alignerThread(in_queue, out_queue).start()
		print(threading.active_count())
	in_queue.join()
	cursor, conn = open_connection(database)
	while not out_queue.empty():
		header, seq, alignedseq = out_queue.get()
		insert_seq(cursor, seq, header, alignedseq)
		conn.commit()
		out_queue.task_done()
	conn.close()
	out_queue.join()
Example #3
def iterate_fasta(fasta, ref, database='data/gsaid.db'):
    """
    Function to iterate through fasta file *non threaded*
    :param cursor: sqlite database handler
    :param fasta: file containing sequences
    :param ref: file containing reference sequence, default-> NC_04552.fa
    """
    handle = gotoh2.iter_fasta(open(fasta))
    _, refseq = gotoh2.convert_fasta(open(ref))[0]
    cursor, conn = open_connection(database)

    # align each record and insert it into the database
    for header, seq in handle:
        alignedseq = find_seq(conn, seq, refseq)
        insert_seq(cursor, seq, header, alignedseq)
        conn.commit()
    conn.close()
Example #4
def iterate_handle(handle, ref, database='data/gsaid.db'):
    """
    Function to iterate through list [(header,seq)....] passed by ChunkyBot ##NON THREADED##
    :param cursor: sqlite database handler
    :param handle: list containing tuples (header, raw seq)
    :param ref: file containing reference sequence, default-> NC_04552.fa
    """

    _, refseq = gotoh2.convert_fasta(open(ref))[0]
    cursor, conn = open_connection(database)

    # align and insert into the database
    for header, seq in handle:
        alignedseq = find_seq(conn, seq, refseq)
        # find_seq() returns 0 when the raw sequence is shorter than 5,000 nt
        if alignedseq == 0:
            print('Sequence {} with a length of {} cannot be aligned.'.format(
                header, len(seq)))
        else:
            print('Aligning {}.'.format(header))
            insert_seq(cursor, seq, header, alignedseq)
            conn.commit()
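
A minimal usage sketch for the non-threaded iterate_handle(); the header, sequence, and reference path below are placeholders, not real data:

# hypothetical record list; the GISAID-style header and sequence are made up
handle = [('hCoV-19/example/2020|EPI_ISL_000000|2020-01-01', 'ACGT' * 7500)]
iterate_handle(handle, ref='data/NC_04552.fa', database='data/gsaid.db')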
Example #5
            maxScore = 0
            maxIndex = -1

            # get the max probability score and its associated index
            for i in range(len(predictions[index])):
                if predictions[index][i] > maxScore:
                    maxScore = predictions[index][i]
                    maxIndex = i

            score = maxScore
            prediction = loaded_model.classes_[maxIndex]
            accession = idList[index].split('|')[1]
            payload = [accession, prediction, score, pangoLEARN.__version__, 'passed_qc', '']
            insert_lineage(cursor, payload)

    conn.commit()

if __name__ == '__main__':

    args = parse_args()
    sequence_handle = gotoh2.convert_fasta(open(args.sequences_file, 'r'))

    filtered_handle = filter_seqs(sequence_handle, args.filterout, max_prop_n=0.05, minlen=29000)

    if args.header_file is None and args.model_file is None:
        classify_and_insert(args.pangolindir + 'decisionTreeHeaders_v1.joblib',
                            args.pangolindir + 'decisionTree_v1.joblib',
                            filtered_handle, args.indicies, args.db)
    else:
        classify_and_insert(args.header_file, args.model_file, filtered_handle, args.indicies, args.db)
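
The manual max-score loop in Example #5 is an argmax over the class probabilities; with numpy the same step can be written compactly. A sketch, assuming predictions holds one row of class probabilities per sequence (e.g. from predict_proba):

import numpy as np

maxIndex = int(np.argmax(predictions[index]))    # index of the best class
score = float(predictions[index][maxIndex])      # its probability
prediction = loaded_model.classes_[maxIndex]     # pangoLEARN lineage label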