def test_build_water_relations():
    """Check the relations built from a water alignment of two sequences.

    The alignment is read from the result file of the 'water' runner (the
    original test describes it as a markx10 format file).
    """
    subject_residues = (
        'ATGGCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCTGCTCA'
        'AGCTAGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTTTTATGTA'
        'CTGTTTTNACTCGCANGACCAACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAG'
        'GGCNTGAAGGTGTGCCCACCACTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGA'
        'TATGAGTAACGAGCAATTGGGAAAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCT'
        'GCATTGAATTCGACATTCACAGTGGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATAC'
        'TTCGATGGACGCTACTGGACCATGTGGAAGCTGCCCATGTTTGGCTGCACCGAT'
    )
    query_residues = (
        'ATGGCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCTGCTCAA'
        'GCTAGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTNACTCGCAN'
        'GACCAACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAGGGCNTGAAGGTGTGCC'
        'CACCACTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGATATGAGTAACGAGCAA'
        'TTGGGAAAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCTGCATTGAATTCGACAT'
        'TCACAGTGGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATACTTCGATGGACGCTAC'
        'TGGACCATGTGGAAGCTGCCCATGTTTGGCTGCACCGAT'
    )

    subject_seq = SeqWithQuality(seq=Seq(subject_residues), name='subject')
    query_seq = SeqWithQuality(seq=Seq(query_residues), name='query')

    # The subject is handed to the runner as a fasta file path
    subject_fhand = temp_fasta_file(subject_seq)
    aligner = create_runner(tool='water',
                            parameters={'subject': subject_fhand.name})
    result_fhand = aligner(query_seq)['water']

    relations = build_relations_from_aligment(result_fhand,
                                              query_name=query_seq.name,
                                              subject_name=subject_seq.name)
    expected = {'query': [(0, 50), (51, 112), (113, 409)],
                'subject': [(0, 50), (52, 113), (129, 425)]}
    assert relations == expected
def test_temp_fasta_file_seq_iter():
    """temp_fasta_file should write every sequence of an iterator as fasta."""
    seqrec1 = SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1')
    seqrec2 = SeqWithQuality(seq=Seq('ATGATAGATAGA'), name='seq2')
    seq_iter = iter([seqrec1, seqrec2])
    fhand = temp_fasta_file(seq_iter)
    # Read through a context manager so the reading handle is closed instead
    # of being left open until garbage collection
    with open(fhand.name) as written:
        content = written.read()
    assert content == ">seq1\nATGATAGATAGATGF\n>seq2\nATGATAGATAGA\n"
def infer_introns_for_cdna(sequence, genomic_db, genomic_seqs_index=None,
                           similar_sequence=None):
    """Locate the introns of a cdna by aligning it with est2genome.

    When similar_sequence is not given, genomic_db is searched with blastn
    and the first result is used; an empty list is returned if the search
    finds nothing. The hit name is then looked up in genomic_seqs_index and
    est2genome is run against that sequence, limited to the region between
    the hit's subject_start and subject_end.

    Returns the introns found in the parsed est2genome result for the cdna.

    Raises KeyError if the hit name is not in genomic_seqs_index and
    RuntimeError if est2genome ends with a non-zero return code.
    """
    hit = similar_sequence
    if not hit:
        # Looking first for the most similar seq in the genomic db lets
        # est2genome work on a single region, which speeds things up
        hits = look_for_similar_sequences(sequence, database=genomic_db,
                                          blast_program="blastn")
        if not hits:
            return []
        hit = hits[0]

    region_start = hit["subject_start"]
    region_end = hit["subject_end"]
    try:
        genomic_seq = genomic_seqs_index[hit["name"]]
    except KeyError:
        raise KeyError("Sequence %s was not found" % hit["name"])

    # est2genome reads both sequences from fasta files
    cdna_fhand = temp_fasta_file(seqs=[sequence])
    genomic_fhand = temp_fasta_file(seqs=[genomic_seq])

    command = ["est2genome", cdna_fhand.name, genomic_fhand.name]
    command.extend(["-sbegin2", str(region_start), "-send2", str(region_end)])
    command.extend(["-stdout", "-auto"])
    stdout, stderr, retcode = call(command)
    if retcode:
        raise RuntimeError("There was an error running est2genome: " + stderr)

    return est2genome_parser(stdout)["cdna"]["introns"]
def infer_introns_for_cdna(sequence, genomic_db, genomic_seqs_index=None,
                           similar_sequence=None):
    """Infer the intron locations of a cdna using est2genome.

    If similar_sequence is given it is used as the genomic hit; otherwise a
    blastn search against genomic_db provides it (first result), and an empty
    list is returned when the search finds nothing. The sequence named by the
    hit is taken from genomic_seqs_index and est2genome is run on the region
    delimited by the hit's subject_start and subject_end.

    Raises KeyError when the hit name is missing from genomic_seqs_index and
    RuntimeError when est2genome returns a non-zero code.
    """
    if similar_sequence:
        best_hit = similar_sequence
    else:
        # knowing where the most similar seq is in the genomic_db
        # speeds things up
        candidates = look_for_similar_sequences(sequence,
                                                database=genomic_db,
                                                blast_program='blastn')
        if not candidates:
            return []
        best_hit = candidates[0]

    start, end = best_hit['subject_start'], best_hit['subject_end']
    try:
        genomic_seq = genomic_seqs_index[best_hit['name']]
    except KeyError:
        msg = 'Sequence %s was not found' % best_hit['name']
        raise KeyError(msg)

    # both est2genome inputs are written to temporary fasta files
    cdna_file = temp_fasta_file(seqs=[sequence])
    genomic_file = temp_fasta_file(seqs=[genomic_seq])

    region_options = ['-sbegin2', str(start), '-send2', str(end)]
    output_options = ['-stdout', '-auto']
    cmd = (['est2genome', cdna_file.name, genomic_file.name] +
           region_options + output_options)
    stdout, stderr, retcode = call(cmd)
    if retcode:
        msg = 'There was an error running est2genome: ' + stderr
        raise RuntimeError(msg)

    parsed = est2genome_parser(stdout)
    return parsed['cdna']['introns']
def _prepare_input_files(inputs, seqs): 'It prepares inputs taking into account the format' for key, value in inputs.items(): files_format = value['files_format'] inputs[key]['fhands'] = [] inputs[key]['fpaths'] = [] seqs, seqs_qual = itertools.tee(seqs, 2) for file_format in files_format: if file_format == 'fasta': fhand = temp_fasta_file(seqs=seqs) elif file_format == 'qual': fhand = temp_qual_file(seqs=seqs_qual) inputs[key]['fhands'].append(fhand) inputs[key]['fpaths'].append(fhand.name)
def _prepare_input_files(inputs, seqs): "It prepares inputs taking into account the format" for key, value in inputs.items(): files_format = value["files_format"] inputs[key]["fhands"] = [] inputs[key]["fpaths"] = [] seqs, seqs_qual = itertools.tee(seqs, 2) for file_format in files_format: if file_format == "fasta": fhand = temp_fasta_file(seqs=seqs) elif file_format == "qual": fhand = temp_qual_file(seqs=seqs_qual) inputs[key]["fhands"].append(fhand) inputs[key]["fpaths"].append(fhand.name)
def test_temp_fasta_file_one_seq():
    """temp_fasta_file should accept a single sequence and write it as fasta."""
    seqrec1 = SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1')
    fhand = temp_fasta_file(seqrec1)
    # Read through a context manager so the reading handle is closed instead
    # of being left open until garbage collection
    with open(fhand.name) as written:
        content = written.read()
    assert content == ">seq1\nATGATAGATAGATGF\n"
def _seq_to_fasta_fhand(seq): 'Given a fhand or Seq object it returns a fhand' if 'file' in seq.__class__.__name__.lower(): return seq return temp_fasta_file(seqs=[seq])
def test_strip_adaptor_blast(self):
    """It tests the adaptor striper (blastn-short) followed by the trimmer."""
    vec1 = SeqWithQuality(name="vec1", seq=Seq("atcgatcgatagcatacgat"))
    vec2 = SeqWithQuality(name="vec2", seq=Seq("atgcatcagatcgataaaga"))
    fhand_vectors = temp_fasta_file([vec1, vec2])
    seq_trimmer = create_seq_trim_and_masker()
    strip_vector_by_alignment = create_adaptor_striper(fhand_vectors)

    def clean(dirty_seq):
        "It strips the adaptors and then trims the resulting sequence"
        return seq_trimmer(strip_vector_by_alignment(dirty_seq))

    insert_residues = (
        "ATGCATCAGATGCATGCATGACTACGACTACGATCAGCATCAGCGATCAGCATCGATACGATC"
    )
    insert = SeqWithQuality(name="seq1", seq=Seq(insert_residues))

    # one adaptor on each side of the insert
    dirty = SeqWithQuality(name=insert.name,
                           seq=vec1.seq + insert.seq + vec2.seq,
                           description="hola")
    cleaned = clean(dirty)
    assert str(insert.seq) == str(cleaned.seq)
    assert cleaned.description == "hola"

    # both adaptors before the insert
    fhand_vectors.seek(0)
    dirty = SeqWithQuality(name=insert.name,
                           seq=vec1.seq + vec2.seq + insert.seq)
    cleaned = clean(dirty)
    assert str(insert.seq) == str(cleaned.seq)

    # overlaping vectors
    fhand_vectors.seek(0)
    residues = vec1.seq[:-2] + vec2.seq + insert.seq + vec2.seq
    dirty = SeqWithQuality(name=insert.name, seq=residues)
    cleaned = clean(dirty)
    assert str(insert.seq) == str(cleaned.seq)

    # Now only vectors
    fhand_vectors.seek(0)
    residues = vec1.seq + vec2.seq + vec2.seq
    dirty = SeqWithQuality(name=insert.name, seq=residues)
    cleaned = clean(dirty)
    assert cleaned is None

    # Now without vectors
    fhand_vectors.seek(0)
    dirty = insert
    cleaned = clean(dirty)
    assert str(insert.seq) == str(cleaned.seq)

    # the first adaptor is reversed
    fhand_vectors.seek(0)
    dirty = SeqWithQuality(name=insert.name,
                           seq=vec1.seq[::-1] + vec2.seq + insert.seq)
    cleaned = clean(dirty)
    assert str(insert.seq) == str(cleaned.seq)

    insert_residues = (
        "ATGCATCAGATGCATGCATGACTACGACTACGATCAGCATCAGCGATCAGCATCGATACGATC"
    )
    insert = SeqWithQuality(name="seq1", seq=Seq(insert_residues))
    # each flanking adaptor differs from vec1/vec2 in one (upper case) base
    #          'atcgatcgatagcatacgat                  atgcatcagatcgataaaga
    residues = "atcgatcgatagcataGgat" + insert.seq + "atgGatcagatcgataaaga"
    dirty = SeqWithQuality(name=insert.name, seq=residues,
                           description="hola")
    cleaned = clean(dirty)
    assert str(insert.seq) == str(cleaned.seq)

    # creating the striper with this longer adaptor should fail
    long_adap = "atcgatcgatagcatacgatatcgatcgatagcatacgatatcgatcgatagcatacc"
    vec1 = SeqWithQuality(name="vec1", seq=Seq(long_adap))
    fhand_vectors = temp_fasta_file([vec1])
    try:
        create_adaptor_striper(fhand_vectors)
    except ValueError:
        pass
    else:
        self.fail("ValueError expected")