def main():
    """Remove the requested annotations from a sequence file.

    The run parameters are obtained from set_parameters(), the format of the
    input file is guessed, the annotations are removed and the resulting
    sequences are written to the output file in the same format as the input.
    """
    in_fhand, out_fhand, annots_to_remove = set_parameters()

    # The guessed input format is reused for the output file.
    seq_format = guess_seq_file_format(in_fhand)

    cleaned_seqs = remove_annotation(in_fhand, seq_format, annots_to_remove)

    write_seqs_in_file(cleaned_seqs, seq_fhand=out_fhand, format=seq_format)
def test_json_reader():
    """It tests the json sequence reader."""
    # First we write some sequences to a json file.
    seq0 = SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1')
    seq1 = SeqWithQuality(seq=Seq('GATACCA', DNAAlphabet()), name='seq2')

    # Both handles are managed by ``with`` so that they are closed (and the
    # temporary file is deleted) even when one of the assertions fails.
    with tempfile.NamedTemporaryFile(suffix='.json') as fhand:
        write_seqs_in_file([seq0, seq1], fhand, format='json')
        # The file is reopened by name below, so the buffered content has to
        # be pushed out first.
        fhand.flush()

        # Now we read them back through an independent handle.
        with open(fhand.name) as in_fhand:
            seqs = list(seqs_in_file(in_fhand))

            assert seqs[0].seq == seq0.seq
            assert seqs[1].seq == seq1.seq
            assert str(seqs[1].seq.alphabet) == str(seq1.seq.alphabet)
def test_description_annotation_analysis():
    """We can annotate with description."""
    test_dir = NamedTemporaryDir()
    # The temporary directory is released in the ``finally`` clause, so a
    # failing assertion does not leave it behind.
    try:
        project_name = "backbone"
        arab_blastdb = join(TEST_DATA_DIR, "blast", "arabidopsis_genes+")
        config = {
            "blast": {"arabidopsis": {"path": arab_blastdb, "species": "arabidopsis"}},
            "Annotation": {"description_annotation": {"description_databases": ["arabidopsis"]}},
            "General_settings": {"threads": THREADS},
        }
        settings_path = create_project(
            directory=test_dir.name, name=project_name, configuration=config
        )
        project_dir = join(test_dir.name, project_name)

        # some melon file to annotate
        input_dir = join(project_dir, BACKBONE_DIRECTORIES["annotation_input"])
        os.makedirs(input_dir)
        seq_ = (
            "AGGTGTCACCGTTCACGAGGGCGACTGGGACTCCCACGGGGCCATCAAGTCCTGGAACTACA"
            "CATGCGGTCCTCTATCTCATTCTCTATTTGTATGAATATGTGTTTATTACTAGCTAGGGTTT"
            "CTATTAATGAAAGGTTCATGTAAATATATGAAGATGGGAAGCAAGAGGTGTTCAAGGAGAAG"
            "AGGGAGTTAGACGACCAGAAGAT"
        )
        seq1 = SeqWithQuality(Seq(seq_), id="CUTC021854")
        seq2 = SeqWithQuality(Seq("Atagtagcatcagatgagcatcgacttctagctagctagct"), id="CUTC021853")
        # Closing the handle explicitly guarantees the fasta is complete on
        # disk before the analysis is launched.
        with open(join(input_dir, "melon.st_nucl.pl_454.fasta"), "a") as fasta_fhand:
            write_seqs_in_file([seq1, seq2], fasta_fhand)

        do_analysis(project_settings=settings_path, kind="annotate_descriptions", silent=True)
        repr_fpath = join(
            project_dir, BACKBONE_DIRECTORIES["annotation_dbs"], "melon.st_nucl.pl_454.0.pickle"
        )
        with open(repr_fpath) as repr_fhand:
            result = repr_fhand.read()
        assert "yet another one" in result

        do_analysis(project_settings=settings_path, kind="annotation_stats", silent=True)
        stats_fpath = join(
            project_dir, "annotations", "features", "stats", "melon.st_nucl.pl_454.txt"
        )
        with open(stats_fpath) as stats_fhand:
            result = stats_fhand.read()
        # NOTE(review): this literal is reproduced exactly as found, on a
        # single line; confirm it matches the layout of the stats file (it may
        # originally have spanned several lines).
        expected = """Annotation statistics --------------------- Number of sequences: 2 Sequences with description: 1"""
        assert expected in result
    finally:
        test_dir.close()
def test_pickle_writer():
    """It tests the pickle sequence writer."""
    alleles = {('G', 3): {}}
    filters = {'a_filter': {('param',): False}}

    # The second sequence carries an snv feature whose qualifiers must
    # survive the write/read round trip.
    snv_location = FeatureLocation(ExactPosition(3), ExactPosition(3))
    snv_qualifiers = {'alleles': alleles, 'filters': filters}
    snv_feature = SeqFeature(snv_location, type='snv',
                             qualifiers=snv_qualifiers)

    plain_seq = SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1')
    seq_with_snv = SeqWithQuality(seq=Seq('GATACCA'), name='seq2',
                                  features=[snv_feature])

    # Write both sequences into an in-memory file and read them back.
    buffer_ = StringIO()
    write_seqs_in_file([plain_seq, seq_with_snv], buffer_, format='pickle')
    buffer_.seek(0)
    read_seqs = list(seqs_in_file(buffer_))

    restored_qualifiers = read_seqs[1].features[0].qualifiers
    assert restored_qualifiers['alleles'] == alleles
    assert restored_qualifiers['filters'] == filters
def seqio(in_seq_fhand, out_seq_fhand, out_format, double_encoding=False,
          in_qual_fhand=None, out_qual_fhand=None, in_format=None):
    """Convert a sequence file from one format into another.

    in_seq_fhand, in_qual_fhand   -- input sequence and (optional) quality
                                     file handles.
    out_seq_fhand, out_qual_fhand -- output sequence and (optional) quality
                                     file handles.
    out_format      -- format to write.
    double_encoding -- passed through to seqs_in_file.
    in_format       -- format of the input; guessed from in_seq_fhand when
                       not given.

    When a quality file handle is involved, or either format is one of
    'repr', 'json' or 'pickle', the sequences are read with seqs_in_file and
    written with write_seqs_in_file. Otherwise the conversion is delegated
    to SeqIO.convert. The output handles are flushed before returning.
    """
    if not in_format:
        in_format = guess_seq_file_format(in_seq_fhand)

    own_formats = ('repr', 'json', 'pickle')
    uses_qual_file = in_qual_fhand is not None or out_qual_fhand is not None
    needs_own_io = (uses_qual_file or
                    in_format in own_formats or
                    out_format in own_formats)

    if not needs_own_io:
        SeqIO.convert(in_seq_fhand, in_format, out_seq_fhand, out_format)
    else:
        seqs = seqs_in_file(seq_fhand=in_seq_fhand,
                            qual_fhand=in_qual_fhand,
                            format=in_format,
                            double_encoding=double_encoding)
        write_seqs_in_file(seqs,
                           seq_fhand=out_seq_fhand,
                           qual_fhand=out_qual_fhand,
                           format=out_format)

    out_seq_fhand.flush()
    if out_qual_fhand:
        out_qual_fhand.flush()
def test_transitive_clustering(self):
    """We do a transitive clustering."""
    blast_fpath = os.path.join(TEST_DATA_DIR,
                               'transitive_cluster.blastout.xml')
    filter1 = {'kind': 'score_threshold',
               'score_key': 'similarity',
               'min_score': 98,
               }
    filter2 = {'kind': 'min_length',
               'min_num_residues': 50,
               'length_in_query': True
               }
    filters = [filter1, filter2]

    # Every handle is opened in a ``with`` block so it gets closed even if
    # an assertion fails. The assertions stay inside the blocks in case the
    # results are computed lazily from the open files.
    with open(blast_fpath, 'rt') as blast_fhand:
        clusters = do_transitive_clustering_on_blast(blast_fhand, filters)
        assert set([u'seq3', u'seq2', u'seq1']) in clusters
        assert set([u'seq4']) in clusters

    # with the sequences
    seq_names = ('seq1', 'seq2', 'seq3', 'seq4', 'seq5', 'seq6')
    seqs = [SeqWithQuality(name=name, seq=Seq('aa')) for name in seq_names]
    with open(blast_fpath, 'rt') as blast_fhand:
        with NamedTemporaryFile() as seqs_fhand:
            write_seqs_in_file(seqs, seqs_fhand)
            clusters, no_matched = do_transitive_clustering_all(blast_fhand,
                                                                seqs_fhand,
                                                                filters)
            assert set([u'seq3', u'seq2', u'seq1']) in clusters
            assert set([u'seq4']) in clusters
            assert 'seq5' in no_matched
            assert 'seq6' in no_matched
def test_json_writer():
    """It tests the json sequence writer."""
    seq0 = SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1')
    alleles = {('G', 3): {}}
    filters = {'a_filter': {('param',): False}}
    snv_feature = SeqFeature(FeatureLocation(ExactPosition(3),
                                             ExactPosition(3)),
                             type='snv',
                             qualifiers={'alleles': alleles,
                                         'filters': filters})
    seq1 = SeqWithQuality(seq=Seq('GATACCA'), name='seq2',
                          features=[snv_feature])

    fhand = StringIO()
    write_seqs_in_file([seq0, seq1], fhand, format='json')

    # The third line of the output is expected to hold the json structure
    # of the second sequence.
    lines = fhand.getvalue().splitlines()
    struct1 = json.loads(lines[2])
    assert struct1['seq']['seq'] == 'GATACCA'
    # The tuple keys of the alleles are expected to be stored as strings.
    # list() is used because dict.keys() cannot be indexed on Python 3.
    allele_keys = list(struct1['features'][0]['qualifiers']['alleles'].keys())
    assert allele_keys[0] == "('G', 3)"

    # Reading the file back must restore the original qualifiers.
    fhand.seek(0)
    seqs = list(seqs_in_file(fhand))
    assert seqs[1].features[0].qualifiers['alleles'] == alleles
    assert seqs[1].features[0].qualifiers['filters'] == filters
def write(self, item):
    """Write one item into the gff file.

    The item should be a (kind, information) tuple, the kind being one of
    METADATA, COMMENT, FEATURE or FASTA. A None item is ignored.

    If nothing has been written to the file yet, the '##gff-version' line
    is written before the item.
    """
    if item is None:
        return

    kind, content = item
    fhand = self._fhand

    # Position 0 means that the version line has not been written yet.
    if fhand.tell() == 0:
        if not self.version:
            self.version = _DEFAULT_WRITE_VERSION
        fhand.write('##gff-version %s\n' % self.version)

    if kind == METADATA:
        fhand.write('##' + content + '\n')
    elif kind == COMMENT:
        fhand.write('#' + content + '\n')
    elif kind == FEATURE:
        fhand.write(self._feature_to_str(content) + '\n')
    elif kind == FASTA:
        # The sequences go after the ##FASTA line, written as fasta.
        fhand.write('##FASTA\n')
        write_seqs_in_file(content, fhand, format='fasta')

    # Push the written data out of the buffer after every item.
    fhand.flush()
def test_ortholog_annotation_analysis():
    """We can annotate orthologs."""
    test_dir = NamedTemporaryDir()
    # The temporary directory is released in the ``finally`` clause, so a
    # failing assertion does not leave it behind.
    try:
        project_name = "backbone"
        config = {
            "blast": {
                "arabidopsis": {"path": "/path/to/tair", "species": "arabidopsis", "kind": "nucl"},
                "arabidopsis2": {"path": "/path/to/tair2", "species": "arabidopsis2", "kind": "nucl"},
            },
            "Annotation": {"ortholog_annotation": {"ortholog_databases": ["arabidopsis", "arabidopsis2"]}},
            "General_settings": {"threads": THREADS},
        }
        settings_path = create_project(
            directory=test_dir.name, name=project_name, configuration=config
        )
        project_dir = join(test_dir.name, project_name)

        # create blast results
        melon_tair_blastdir = join(project_dir, "annotations", "blast", "melon.st_nucl.pl_454", "tair")
        melon_tair2_blastdir = join(project_dir, "annotations", "blast", "melon.st_nucl.pl_454", "tair2")
        os.makedirs(melon_tair_blastdir)
        os.makedirs(melon_tair2_blastdir)
        tair_melon_blastdir = join(project_dir, "annotations", "blast", "tair", "melon.st_nucl.pl_454")
        tair2_melon_blastdir = join(project_dir, "annotations", "blast", "tair2", "melon.st_nucl.pl_454")
        os.makedirs(tair_melon_blastdir)
        os.makedirs(tair2_melon_blastdir)
        blast_fname = BACKBONE_BASENAMES["blast_basename"] + ".tblastx.xml"
        shutil.copy(join(TEST_DATA_DIR, "melon_tair.xml"), join(melon_tair_blastdir, blast_fname))
        shutil.copy(join(TEST_DATA_DIR, "melon_tair.xml"), join(melon_tair2_blastdir, blast_fname))
        shutil.copy(join(TEST_DATA_DIR, "tair_melon.xml"), join(tair_melon_blastdir, blast_fname))
        shutil.copy(join(TEST_DATA_DIR, "tair_melon.xml"), join(tair2_melon_blastdir, blast_fname))

        # some melon file to annotate
        input_dir = join(project_dir, BACKBONE_DIRECTORIES["annotation_input"])
        os.makedirs(input_dir)
        seq1 = SeqWithQuality(Seq("A"), id="melon1")
        seq2 = SeqWithQuality(Seq("A"), id="melon2")
        # Closing the handle explicitly guarantees the fasta is complete on
        # disk before the analysis is launched.
        with open(join(input_dir, "melon.st_nucl.pl_454.fasta"), "a") as fasta_fhand:
            write_seqs_in_file([seq1, seq2], fasta_fhand)

        do_analysis(project_settings=settings_path, kind="annotate_orthologs", silent=True)
        pickle_fpath = join(
            project_dir, BACKBONE_DIRECTORIES["annotation_dbs"], "melon.st_nucl.pl_454.0.pickle"
        )
        # Named pickle_content so the local does not shadow the pickle module.
        with open(pickle_fpath) as pickle_fhand:
            pickle_content = pickle_fhand.read()
        assert "arabidopsis-orthologs" in pickle_content
        assert "arabidopsis2-orthologs" in pickle_content

        do_analysis(project_settings=settings_path, kind="write_annotations", silent=True)
        orthologs_fpath = join(project_dir, "annotations", "features", "melon.st_nucl.pl_454.orthologs")
        assert os.path.exists(orthologs_fpath)
        with open(orthologs_fpath) as orthologs_fhand:
            assert "tair1" in orthologs_fhand.read()

        # No orf file should have been written by this analysis.
        orf_fpath = join(project_dir, "annotations", "features", "melon.st_nucl.pl_454.orf")
        assert not os.path.exists(orf_fpath)

        do_analysis(project_settings=settings_path, kind="annotation_stats", silent=True)
        stats_fpath = join(
            project_dir, "annotations", "features", "stats", "melon.st_nucl.pl_454.txt"
        )
        with open(stats_fpath) as stats_fhand:
            result = stats_fhand.read()
        # NOTE(review): this literal is reproduced exactly as found, on a
        # single line; confirm it matches the layout of the stats file (it may
        # originally have spanned several lines).
        expected = """Orthologs _________ Sequences with arabidopsis orthologs: 2 Number of arabidopsis orthologs: 2 Sequences with arabidopsis2 orthologs: 2 Number of arabidopsis2 orthologs: 2"""
        assert expected in result
    finally:
        test_dir.close()