def test_build_water_relations():
    '''It checks the relations built between two sequences from the
    alignment produced by the water runner (markx10 format file).'''
    subject_str = ''.join([
        'ATGGCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCTGCTCA',
        'AGCTAGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTTTTATGTA',
        'CTGTTTTNACTCGCANGACCAACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAG',
        'GGCNTGAAGGTGTGCCCACCACTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGA',
        'TATGAGTAACGAGCAATTGGGAAAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCT',
        'GCATTGAATTCGACATTCACAGTGGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATAC',
        'TTCGATGGACGCTACTGGACCATGTGGAAGCTGCCCATGTTTGGCTGCACCGAT',
    ])

    query_str = ''.join([
        'ATGGCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCTGCTCAA',
        'GCTAGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTNACTCGCAN',
        'GACCAACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAGGGCNTGAAGGTGTGCC',
        'CACCACTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGATATGAGTAACGAGCAA',
        'TTGGGAAAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCTGCATTGAATTCGACAT',
        'TCACAGTGGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATACTTCGATGGACGCTAC',
        'TGGACCATGTGGAAGCTGCCCATGTTTGGCTGCACCGAT',
    ])

    subject = SeqWithQuality(seq=Seq(subject_str), name='subject')
    query = SeqWithQuality(seq=Seq(query_str), name='query')

    # The subject is handed to the runner as a fasta file path, the query
    # as a sequence object.
    subject_fhand = temp_fasta_file(subject)
    run_water = create_runner(tool='water',
                              parameters={'subject': subject_fhand.name})
    alignment_fhand = run_water(query)['water']

    relations = build_relations_from_aligment(alignment_fhand,
                                              query_name=query.name,
                                              subject_name=subject.name)
    expected = {'query': [(0, 50), (51, 112), (113, 409)],
                'subject': [(0, 50), (52, 113), (129, 425)]}
    assert relations == expected
Ejemplo n.º 2
0
 def test_temp_fasta_file_seq_iter():
     'It tests temp_fasta_file when it is given an iterator of sequences'
     seqrec1 = SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1')
     seqrec2 = SeqWithQuality(seq=Seq('ATGATAGATAGA'), name='seq2')
     seq_iter = iter([seqrec1, seqrec2])
     fhand = temp_fasta_file(seq_iter)
     # Read the written file back through a context manager so the extra
     # handle is closed right away instead of being leaked.
     with open(fhand.name) as written:
         content = written.read()
     assert content == ">seq1\nATGATAGATAGATGF\n>seq2\nATGATAGATAGA\n"
Ejemplo n.º 3
0
def infer_introns_for_cdna(sequence, genomic_db, genomic_seqs_index=None, similar_sequence=None):
    """It returns the introns that est2genome reports for the given cdna.

    When similar_sequence is not given, genomic_db is searched with
    look_for_similar_sequences (blastn) and the first hit is used; if there
    are no hits an empty list is returned. The hit "name" is looked up in
    genomic_seqs_index (KeyError when it is missing) and est2genome is run
    against that sequence, limited to the hit's subject_start/subject_end.
    A RuntimeError is raised when est2genome ends with a non-zero code.
    """
    best_hit = similar_sequence
    if not best_hit:
        # Narrowing the search to the first similar genomic sequence keeps
        # the est2genome run short.
        hits = look_for_similar_sequences(sequence, database=genomic_db, blast_program="blastn")
        if not hits:
            return []
        best_hit = hits[0]

    region_start = best_hit["subject_start"]
    region_end = best_hit["subject_end"]
    try:
        genomic_seq = genomic_seqs_index[best_hit["name"]]
    except KeyError:
        raise KeyError("Sequence %s was not found" % best_hit["name"])

    # est2genome reads both sequences from fasta files; the handles stay
    # referenced until the function returns.
    cdna_fhand = temp_fasta_file(seqs=[sequence])
    genomic_fhand = temp_fasta_file(seqs=[genomic_seq])

    cmd = ["est2genome", cdna_fhand.name, genomic_fhand.name]
    cmd += ["-sbegin2", str(region_start), "-send2", str(region_end)]
    cmd += ["-stdout", "-auto"]
    stdout, stderr, retcode = call(cmd)
    if retcode:
        raise RuntimeError("There was an error running est2genome: " + stderr)

    # Only the introns located on the cdna are of interest to the caller.
    return est2genome_parser(stdout)["cdna"]["introns"]
Ejemplo n.º 4
0
def infer_introns_for_cdna(sequence, genomic_db, genomic_seqs_index=None,
                           similar_sequence=None):
    '''It locates the introns of a cdna by running est2genome.

    similar_sequence, when given, is the hit to align against; otherwise
    the first result of a blastn search in genomic_db is used, and an empty
    list is returned if the search finds nothing. The genomic sequence is
    taken from genomic_seqs_index using the hit name; a KeyError is raised
    if it is not there. A RuntimeError is raised if est2genome fails.
    '''
    if similar_sequence:
        hit = similar_sequence
    else:
        # looking for the similar genomic sequence first speeds things up
        candidates = look_for_similar_sequences(sequence,
                                                database=genomic_db,
                                                blast_program='blastn')
        if not candidates:
            return []
        hit = candidates[0]

    begin, finish = hit['subject_start'], hit['subject_end']
    try:
        genomic_seq = genomic_seqs_index[hit['name']]
    except KeyError:
        not_found_msg = 'Sequence %s was not found' % hit['name']
        raise KeyError(not_found_msg)

    # est2genome needs both sequences written in fasta files
    cdna_fhand = temp_fasta_file(seqs=[sequence])
    genomic_fhand = temp_fasta_file(seqs=[genomic_seq])

    # the alignment is restricted to the region covered by the hit
    cmd = ['est2genome', cdna_fhand.name, genomic_fhand.name]
    cmd.extend(['-sbegin2', str(begin), '-send2', str(finish)])
    cmd.extend(['-stdout', '-auto'])
    stdout, stderr, retcode = call(cmd)

    if retcode:
        raise RuntimeError('There was an error running est2genome: ' + stderr)

    parsed = est2genome_parser(stdout)
    return parsed['cdna']['introns']
Ejemplo n.º 5
0
def _prepare_input_files(inputs, seqs):
    '''It prepares inputs taking into account the format.

    inputs is a dict whose values are dicts with a 'files_format' list.
    For every format listed a temporary file is written with the sequences
    ('fasta' with temp_fasta_file, 'qual' with temp_qual_file). The file
    handles are stored in the 'fhands' list of the value and their names in
    its 'fpaths' list. inputs is modified in place and nothing is returned.

    A ValueError is raised for any other format.
    '''
    for value in inputs.values():
        files_format = value['files_format']
        value['fhands'] = []
        value['fpaths'] = []
        for file_format in files_format:
            # Every written file consumes its own copy of the sequences,
            # so seqs stays unread for the following formats and inputs.
            # Sharing one copy would leave all but the first file empty.
            seqs, file_seqs = itertools.tee(seqs, 2)
            if file_format == 'fasta':
                fhand = temp_fasta_file(seqs=file_seqs)
            elif file_format == 'qual':
                fhand = temp_qual_file(seqs=file_seqs)
            else:
                # Failing loudly avoids re-appending the previous handle.
                raise ValueError('Unsupported file format: %s' % file_format)
            value['fhands'].append(fhand)
            value['fpaths'].append(fhand.name)
Ejemplo n.º 6
0
def _prepare_input_files(inputs, seqs):
    """It prepares inputs taking into account the format.

    Each value of the inputs dict must have a "files_format" list. One
    temporary file is written per listed format ("fasta" through
    temp_fasta_file, "qual" through temp_qual_file). The resulting handles
    are collected in the value's "fhands" list and their names in its
    "fpaths" list. The inputs dict is updated in place; nothing is returned.

    Raises ValueError if a format is neither "fasta" nor "qual".
    """
    for value in inputs.values():
        files_format = value["files_format"]
        value["fhands"] = []
        value["fpaths"] = []
        for file_format in files_format:
            # A fresh copy of the sequences is split off for each file.
            # Writing a file exhausts its iterator, so a shared copy would
            # leave the files of later formats or inputs empty.
            seqs, file_seqs = itertools.tee(seqs, 2)
            if file_format == "fasta":
                fhand = temp_fasta_file(seqs=file_seqs)
            elif file_format == "qual":
                fhand = temp_qual_file(seqs=file_seqs)
            else:
                # Without this branch the previous handle would be reused.
                raise ValueError("Unsupported file format: %s" % file_format)
            value["fhands"].append(fhand)
            value["fpaths"].append(fhand.name)
Ejemplo n.º 7
0
 def test_temp_fasta_file_one_seq():
     'It tests temp_fasta_file when it is given just one sequence'
     seqrec1 = SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1')
     fhand = temp_fasta_file(seqrec1)
     # The file is reopened inside a with block so that this second handle
     # is closed as soon as the content has been read.
     with open(fhand.name) as written:
         content = written.read()
     assert content == ">seq1\nATGATAGATAGATGF\n"
Ejemplo n.º 8
0
def _seq_to_fasta_fhand(seq):
    '''It returns a fhand for the given fhand or sequence.

    An object whose class name contains "file" (ignoring case) is taken to
    be a fhand already and is returned as it is. Anything else is written
    to a temporary fasta file and that file is returned.
    '''
    class_name = seq.__class__.__name__
    if class_name.lower().find('file') == -1:
        return temp_fasta_file(seqs=[seq])
    return seq
Ejemplo n.º 9
0
    def test_strip_adaptor_blast(self):
        "It checks the adaptor striper (blastn-short) with several read layouts"

        adaptor1 = SeqWithQuality(name="vec1", seq=Seq("atcgatcgatagcatacgat"))
        adaptor2 = SeqWithQuality(name="vec2", seq=Seq("atgcatcagatcgataaaga"))
        fhand_vectors = temp_fasta_file([adaptor1, adaptor2])
        trim = create_seq_trim_and_masker()
        strip = create_adaptor_striper(fhand_vectors)

        def strip_and_trim(read):
            "It strips the adaptors from the read and then trims the result"
            return trim(strip(read))

        insert_str = "ATGCATCAGATGCATGCATGACTACGACTACGATCAGCATCAGCGATCAGCATCGATACGATC"
        insert = SeqWithQuality(name="seq1", seq=Seq(insert_str))

        # one adaptor at each end, the description has to be kept
        read = SeqWithQuality(
            name=insert.name,
            seq=adaptor1.seq + insert.seq + adaptor2.seq,
            description="hola",
        )
        cleaned = strip_and_trim(read)
        assert str(insert.seq) == str(cleaned.seq)
        assert cleaned.description == "hola"

        # both adaptors before the insert
        fhand_vectors.seek(0)
        read = SeqWithQuality(name=insert.name, seq=adaptor1.seq + adaptor2.seq + insert.seq)
        cleaned = strip_and_trim(read)
        assert str(insert.seq) == str(cleaned.seq)

        # overlapping adaptors
        fhand_vectors.seek(0)
        read_seq = adaptor1.seq[:-2] + adaptor2.seq + insert.seq + adaptor2.seq
        read = SeqWithQuality(name=insert.name, seq=read_seq)
        cleaned = strip_and_trim(read)
        assert str(insert.seq) == str(cleaned.seq)

        # a read made only of adaptors leaves nothing
        fhand_vectors.seek(0)
        read_seq = adaptor1.seq + adaptor2.seq + adaptor2.seq
        read = SeqWithQuality(name=insert.name, seq=read_seq)
        cleaned = strip_and_trim(read)
        assert cleaned is None

        # a read without adaptors is passed in as it is
        fhand_vectors.seek(0)
        cleaned = strip_and_trim(insert)
        assert str(insert.seq) == str(cleaned.seq)

        # the first adaptor reversed
        fhand_vectors.seek(0)
        read = SeqWithQuality(
            name=insert.name,
            seq=adaptor1.seq[::-1] + adaptor2.seq + insert.seq,
        )
        cleaned = strip_and_trim(read)
        assert str(insert.seq) == str(cleaned.seq)

        # adaptors with one changed base each, on a freshly built insert
        insert = SeqWithQuality(name="seq1", seq=Seq(insert_str))
        #             'atcgatcgatagcatacgat                  atgcatcagatcgataaaga
        mismatched = "atcgatcgatagcataGgat" + insert.seq + "atgGatcagatcgataaaga"
        read = SeqWithQuality(name=insert.name, seq=mismatched, description="hola")
        cleaned = strip_and_trim(read)
        assert str(insert.seq) == str(cleaned.seq)

        # the striper creation is expected to fail with this longer adaptor
        long_adap = "atcgatcgatagcatacgatatcgatcgatagcatacgatatcgatcgatagcatacc"
        fhand_vectors = temp_fasta_file([SeqWithQuality(name="vec1", seq=Seq(long_adap))])
        try:
            create_adaptor_striper(fhand_vectors)
        except ValueError:
            pass
        else:
            self.fail("ValueError expected")