def __init__(self, ref_genome="at_tair10"):
    """Populate chromosome names, lengths and cumulative offsets.

    Parameters
    ----------
    ref_genome : str
        Either the literal ``"at_tair10"`` (uses the hard-coded table
        below) or the path to an existing FASTA file, whose chromosome
        names and lengths are read from its pyfaidx index.

    Raises
    ------
    ValueError
        If ``ref_genome`` is neither ``"at_tair10"`` nor an existing path.
        (Previously this fell through to an AttributeError on
        ``self.golden_chrlen``.)
    """
    if ref_genome == "at_tair10":
        self.chrs = ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5']
        self.def_color = [
            "#1f78b4", "#33a02c", "#1f78b4", "#33a02c", "#1f78b4"
        ]
        self.real_chrlen = [
            34964571, 22037565, 25499034, 20862711, 31270811
        ]
        self.golden_chrlen = [
            30427671, 19698289, 23459830, 18585056, 26975502
        ]
        self.centro_start = [
            14364752, 3602775, 12674550, 2919690, 11668616
        ]
        self.centro_end = [15750321, 3735247, 13674767, 4011692, 12082583]
        # Historical (misspelled) attribute name kept for existing callers.
        self.cetro_mid = np.add(self.centro_start, self.centro_end) / 2
        # Correctly spelled alias pointing at the same array.
        self.centro_mid = self.cetro_mid
    elif os.path.exists(ref_genome):
        ## Provide a fasta file to check for genome lengths etc
        from pyfaidx import Faidx
        genome = Faidx(ref_genome).index
        # sorted() works on the dict view directly; wrapping the view in
        # np.array() yields a 0-d object array on Python 3, which cannot
        # be sorted.
        self.chrs = sorted(genome.keys())
        self.real_chrlen = [genome[ef].rlen for ef in self.chrs]
        self.golden_chrlen = self.real_chrlen
    else:
        raise ValueError(
            "ref_genome must be 'at_tair10' or a path to an existing "
            "FASTA file, got %r" % (ref_genome,)
        )
    # Cumulative start offset of every chromosome, with a leading 0.
    self.chr_inds = np.append(0, np.cumsum(self.golden_chrlen))
def test_fetch_border(self):
    """Fetch 480-500 from the KF435150.1 record, i.e. past its end.

    The result is expected to contain only 'TC'.
    """
    record_name = 'gi|557361099|gb|KF435150.1|'
    index = Faidx('data/genes.fasta')
    fetched = index.fetch(record_name, 480, 500)
    assert str(fetched) == 'TC'
def test_fetch_border_padded(self):
    """Fetch 480-500 past the end of a record with ``default_seq='N'``.

    The expected string is 'TC' followed by 'N' characters.
    """
    record_name = 'gi|557361099|gb|KF435150.1|'
    padded_index = Faidx('data/genes.fasta.gz', default_seq='N')
    fetched = padded_index.fetch(record_name, 480, 500)
    print(fetched)
    assert str(fetched) == 'TCNNNNNNNNNNNNNNNNNNN'
def test_build_issue_111(self):
    """Build an index with ``read_long_names`` and a version-stripping key.

    The key function keeps the part of each name before the first '.'.
    The expected text uses explicit tab escapes because .fai records are
    tab-delimited; space-separated fields would never match index output.
    """
    expect_index = (
        "gi|563317589|dbj|AB821309\t3510\t114\t70\t71\n"
        "gi|557361099|gb|KF435150\t481\t3789\t70\t71\n"
        "gi|557361097|gb|KF435149\t642\t4368\t70\t71\n"
        "gi|543583796|ref|NR_104216\t4573\t5141\t70\t71\n"
        "gi|543583795|ref|NR_104215\t5317\t9901\t70\t71\n"
        "gi|543583794|ref|NR_104212\t5374\t15415\t70\t71\n"
        "gi|543583788|ref|NM_001282545\t4170\t20980\t70\t71\n"
        "gi|543583786|ref|NM_001282543\t5466\t25324\t70\t71\n"
        "gi|543583785|ref|NM_000465\t5523\t30980\t70\t71\n"
        "gi|543583740|ref|NM_001282549\t3984\t36696\t70\t71\n"
        "gi|543583738|ref|NM_001282548\t4113\t40851\t70\t71\n"
        "gi|530384540|ref|XM_005249645\t2752\t45151\t70\t71\n"
        "gi|530384538|ref|XM_005249644\t3004\t48071\t70\t71\n"
        "gi|530384536|ref|XM_005249643\t3109\t51246\t70\t71\n"
        "gi|530384534|ref|XM_005249642\t3097\t54528\t70\t71\n"
        "gi|530373237|ref|XM_005265508\t2794\t57830\t70\t71\n"
        "gi|530373235|ref|XM_005265507\t2848\t60824\t70\t71\n"
        "gi|530364726|ref|XR_241081\t1009\t63849\t70\t71\n"
        "gi|530364725|ref|XR_241080\t4884\t65009\t70\t71\n"
        "gi|530364724|ref|XR_241079\t2819\t70099\t70\t71\n"
    )
    index = Faidx(
        'data/genes.fasta',
        read_long_names=True,
        key_function=lambda x: x.split('.')[0],
    )
    result_index = ''.join(index._index_as_string())
    assert result_index == expect_index
def setUpClass(cls):
    """Create a temp dir holding an indexed FASTA and a gzipped input VCF.

    Also registers the temp dir, reference, input VCF and output VCF paths
    on ``get_options``.
    NOTE(review): takes ``cls``; a ``@classmethod`` decorator is presumably
    applied outside this snippet - confirm.
    """
    cls.dir = tempfile.mkdtemp()

    # create a fasta file
    cls.fa = os.path.join(cls.dir, 'genome.fa')
    with open(cls.fa, mode='wt') as fasta_out:
        fasta_out.writelines([
            '>chr1\n',
            'ACTGATGCTAGCTAGTATCTGACTCAGTAGCTCGAT\n',
        ])

    # index the fasta file
    Faidx(cls.fa).close()

    # set the final args that depend on the temp directory
    get_options.set_attr('tempdir', cls.dir)
    get_options.set_attr('reference', cls.fa)
    invcf = os.path.join(cls.dir, 'in.vcf.gz')
    outvcf = os.path.join(cls.dir, 'out.vcf.gz')
    get_options.set_attr('vcf', invcf)
    get_options.set_attr('out', outvcf)

    # write a VCF to be converted. This includes one variant which cannot be
    # converted. TODO: make a unit test to check for expected log output for
    # unconvertible variant
    vcf_lines = (
        '##fileformat=VCFv4.1\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        '1\t10\t.\tT\tG\t100\tPASS\tAC=100\n',
        '1\t1000000\t.\tT\tG\t100\tPASS\tAC=100\n',
        '1\t2000000\t.\tA\tG\t100\tPASS\tAC=100\n',
    )
    with gzip.open(invcf, 'wt') as vcf_out:
        vcf_out.write(''.join(vcf_lines))
def test_key_function_by_fetch(self):
    """Fetch by short key from an index built with ``split_char='|'`` and
    ``duplicate_action="drop"``."""
    index = Faidx(
        'data/genes.fasta',
        split_char='|',
        duplicate_action="drop",
    )
    fetched = index.fetch('KF435150.1', 100, 150)
    assert str(fetched) == 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
def test_reindex_on_modification(self):
    """Check that a FASTA newer than its index triggers a fresh index.

    The FASTA mtime is pushed 10 seconds past the index mtime, then the
    file is reopened and the index mtime must have increased.
    See mdshw5/pyfaidx#50.
    """
    fasta_path = 'data/genes.fasta'

    handle = Faidx(fasta_path)
    index_mtime = getmtime(handle.indexname)
    handle.close()

    newer = index_mtime + 10
    os.utime(fasta_path, (newer, newer))
    time.sleep(2)

    handle = Faidx(fasta_path)
    assert getmtime(handle.indexname) > index_mtime
class Genome(object):
    """Wrapper around a pyfaidx index for ``<DATA_FOLDER>/<db>/<db>.fa``."""

    def __init__(self, db):
        """Open the FASTA belonging to database name ``db``."""
        from pyfaidx import Faidx
        data_root = app.config["DATA_FOLDER"]
        fasta_path = os.path.join(data_root, db, db + ".fa")
        self.fasta = Faidx(fasta_path)

    def get_sequence(self, chr, start, end):
        """Return whatever ``Faidx.fetch`` yields for ``chr:start-end``."""
        region = self.fasta.fetch(chr, start, end)
        return region

    def destroy(self):
        """Close the underlying index handle."""
        self.fasta.close()
def setUpClass(cls):
    """Create a temp dir with a one-record FASTA (``chrN``) and index it.

    NOTE(review): takes ``cls``; a ``@classmethod`` decorator is presumably
    applied outside this snippet - confirm.
    """
    cls.dir = tempfile.mkdtemp()

    # create a fasta file
    cls.fa = os.path.join(cls.dir, 'genome.fa')
    with open(cls.fa, mode='wt') as fasta_out:
        fasta_out.writelines([
            '>chrN\n',
            'NNTGATGCTAGCTAGTATCTG\n',
        ])

    # index the fasta file
    Faidx(cls.fa).close()
def test_fetch_whole_entry(self):
    """Fetch positions 1-481 of KF435150.1 from the gzipped FASTA and
    compare against the full expected sequence."""
    expected_sequence = ''.join([
        'ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA',
        'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA',
        'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG',
        'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT',
        'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG',
        'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG',
        'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT',
        'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA',
        'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA',
        'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC',
    ])
    index = Faidx('data/genes.fasta.gz')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 1, 481)
    assert str(fetched) == expected_sequence
def load_seqs_and_annotations(protein_annotations_sqlite_db_file_path, fasta_file_path, shuffle = True, records_limit = None, verbose = True,
        log_progress_every = 10000):
    """Yield ``(uniprot_id, sequence, go_annotations)`` for annotated proteins.

    Rows are read from the ``protein_annotations`` table of the given SQLite
    database. For each row the sequence is looked up in the FASTA under the
    id ``'UniRef90_' + uniprot_id.split('_')[0]``; rows whose id raises
    ``KeyError`` in the FASTA index are counted and skipped.

    Parameters:
        protein_annotations_sqlite_db_file_path: path of the SQLite database.
        fasta_file_path: path of the FASTA file opened with ``Faidx``.
        shuffle: if true, rows are shuffled with ``random_state=0``.
        records_limit: optional ``LIMIT`` applied to the SQL query.
        verbose: if true, progress is reported through ``log``.
        log_progress_every: progress is logged every this many rows.

    Yields:
        Tuples of (uniprot name, sequence string, JSON-decoded annotation
        indices).
    """
    if verbose:
        log('Loading %s records...' % ('all' if records_limit is None else records_limit))

    query = 'SELECT uniprot_name, complete_go_annotation_indices FROM protein_annotations'
    if records_limit is not None:
        query += ' LIMIT %d' % records_limit

    # The connection is only needed for this single query; close it right
    # away instead of leaking it for the lifetime of the generator.
    conn = sqlite3.connect(protein_annotations_sqlite_db_file_path)
    try:
        raw_proteins_and_annotations = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    if verbose:
        log('Loaded %d proteins and their GO annotations (%d columns: %s)' % (raw_proteins_and_annotations.shape + \
                (', '.join(raw_proteins_and_annotations.columns),)))

    if shuffle:
        raw_proteins_and_annotations = raw_proteins_and_annotations.sample(frac = 1, random_state = 0)

    if verbose:
        log('Loading Faidx (%s)...' % fasta_file_path)

    seqs_faidx = Faidx(fasta_file_path)

    if verbose:
        log('Finished loading Faidx.')

    n_failed = 0

    # try/finally so the FASTA handle is released both when the generator is
    # exhausted and when the consumer stops iterating early.
    try:
        for i, (_, (uniprot_id, raw_go_annotation_indices)) in enumerate(raw_proteins_and_annotations.iterrows()):

            if verbose and i % log_progress_every == 0:
                log('%d/%d' % (i, len(raw_proteins_and_annotations)), end = '\r')

            seq_fasta_id = 'UniRef90_%s' % uniprot_id.split('_')[0]

            # Only the index lookup / fetch is guarded, so a KeyError from
            # anywhere else is not silently counted as a missing sequence.
            try:
                seq = str(seqs_faidx.fetch(seq_fasta_id, 1, seqs_faidx.index[seq_fasta_id].rlen))
            except KeyError:
                n_failed += 1
                continue

            yield uniprot_id, seq, json.loads(raw_go_annotation_indices)
    finally:
        seqs_faidx.close()

    if verbose:
        log('Finished. Failed finding the sequence for %d of %d records.' % (n_failed, len(raw_proteins_and_annotations)))
def test_build_issue_141(self):
    """Index 'data/issue_141.fasta' and compare the written .fai text.

    Every expected record has a line width of 70 bases / 72 bytes.
    Fields use explicit tab escapes because .fai records are tab-delimited.
    The generated index file is removed before the comparison so a failing
    assert does not leave it behind.
    """
    expect_index = (
        "gi|563317589|dbj|AB821309.1|\t3510\t115\t70\t72\n"
        "gi|557361099|gb|KF435150.1|\t481\t3842\t70\t72\n"
        "gi|557361097|gb|KF435149.1|\t642\t4429\t70\t72\n"
        "gi|543583796|ref|NR_104216.1|\t4573\t5213\t70\t72\n"
        "gi|543583795|ref|NR_104215.1|\t5317\t10040\t70\t72\n"
        "gi|543583794|ref|NR_104212.1|\t5374\t15631\t70\t72\n"
        "gi|543583788|ref|NM_001282545.1|\t4170\t21274\t70\t72\n"
        "gi|543583786|ref|NM_001282543.1|\t5466\t25679\t70\t72\n"
        "gi|543583785|ref|NM_000465.3|\t5523\t31415\t70\t72\n"
        "gi|543583740|ref|NM_001282549.1|\t3984\t37211\t70\t72\n"
        "gi|543583738|ref|NM_001282548.1|\t4113\t41424\t70\t72\n"
        "gi|530384540|ref|XM_005249645.1|\t2752\t45784\t70\t72\n"
        "gi|530384538|ref|XM_005249644.1|\t3004\t48745\t70\t72\n"
        "gi|530384536|ref|XM_005249643.1|\t3109\t51964\t70\t72\n"
        "gi|530384534|ref|XM_005249642.1|\t3097\t55292\t70\t72\n"
        "gi|530373237|ref|XM_005265508.1|\t2794\t58640\t70\t72\n"
        "gi|530373235|ref|XM_005265507.1|\t2848\t61675\t70\t72\n"
        "gi|530364726|ref|XR_241081.1|\t1009\t64742\t70\t72\n"
        "gi|530364725|ref|XR_241080.1|\t4884\t65918\t70\t72\n"
        "gi|530364724|ref|XR_241079.1|\t2819\t71079\t70\t72\n"
    )
    index_file = Faidx('data/issue_141.fasta').indexname
    # Read through a context manager so the handle is closed before the
    # file is deleted.
    with open(index_file) as index_handle:
        result_index = index_handle.read()
    os.remove('data/issue_141.fasta.fai')
    print(result_index)
    assert result_index == expect_index
def test_fetch_whole_entry(self):
    """Fetch positions 1-482 of KF435150.1 and compare against the full
    expected sequence."""
    expected_sequence = "".join([
        "ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA",
        "CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA",
        "AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG",
        "TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT",
        "AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG",
        "TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG",
        "AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT",
        "AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA",
        "GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA",
        "TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC",
    ])
    index = Faidx("data/genes.fasta")
    fetched = index.fetch("gi|557361099|gb|KF435150.1|", 1, 482)
    assert str(fetched) == expected_sequence
def test_build_issue_96_fail_read_malformed_index_duplicate_key(self):
    """Ensure that the fasta file is closed if construction of the 'Faidx'
    file fails when attempting to read a pre-existing index. The index is
    malformed because it contains multiple occurrences of the same key.
    See mdshw5/pyfaidx#96
    """
    tmp_dir = mkdtemp()
    try:
        fasta_path = os.path.join(tmp_dir, 'issue_96.fasta')
        faidx_path = os.path.join(tmp_dir, 'issue_96.fasta.fai')
        # Write simple fasta file
        with open(fasta_path, 'w') as fasta_out:
            fasta_out.write(">seq1\nCTCCGGGCCCAT\nATAAAGCCTAAA\n")
        # Index with the same record listed twice
        with open(faidx_path, 'w') as faidx_out:
            faidx_out.write("seq1\t24\t6\t12\t13\nseq1\t24\t6\t12\t13\n")

        builtins_open = builtins.open
        opened_files = []

        def test_open(*args, **kwargs):
            # Record every file object handed out while the patch is active.
            f = builtins_open(*args, **kwargs)
            opened_files.append(f)
            return f

        with mock.patch('six.moves.builtins.open', side_effect=test_open):
            try:
                Faidx(fasta_path)
            except ValueError:
                pass
            else:
                # TestCase has no ``assertFail``; ``fail`` is the real API,
                # so a missing exception now reports as a test failure
                # instead of an AttributeError.
                self.fail(
                    "Faidx construction should fail with 'ValueError'.")
        self.assertTrue(all(f.closed for f in opened_files))
    finally:
        shutil.rmtree(tmp_dir)
def test_valgrind_blank_lines(self):
    """Blank out each full-length sequence line in turn and index the result.

    For every line of 'data/genes.fasta' that is not a header and is exactly
    71 characters long, a copy of the file with that line replaced by a bare
    newline is written to a temp file and indexed. None of those attempts
    may succeed.
    """
    # http://stackoverflow.com/a/23212515/717419
    if platform.system() == 'Windows':
        raise SkipTest

    outcomes = []
    with open('data/genes.fasta') as genes:
        fasta = genes.readlines()

    for target in range(len(fasta)):
        with NamedTemporaryFile(mode='w') as scratch:
            for position, text in enumerate(fasta):
                if position == target:
                    blanked = text[0] != '>' and len(text) == 71
                    if blanked:
                        text = '\n'
                scratch.write(text)
            scratch.flush()

            # Only files where a full-length line was blanked are indexed.
            if blanked:
                try:
                    Faidx(scratch.name)
                except FastaIndexingError:
                    outcomes.append(False)
                else:
                    outcomes.append(True)

    assert not any(outcomes)
def test_build_issue_126(self):
    """ Samtools BGZF index should be identical to pyfaidx BGZF index.

    Fields use explicit tab escapes because .fai records are tab-delimited.
    """
    expect_index = (
        "gi|563317589|dbj|AB821309.1|\t3510\t114\t70\t71\n"
        "gi|557361099|gb|KF435150.1|\t481\t3789\t70\t71\n"
        "gi|557361097|gb|KF435149.1|\t642\t4368\t70\t71\n"
        "gi|543583796|ref|NR_104216.1|\t4573\t5141\t70\t71\n"
        "gi|543583795|ref|NR_104215.1|\t5317\t9901\t70\t71\n"
        "gi|543583794|ref|NR_104212.1|\t5374\t15415\t70\t71\n"
        "gi|543583788|ref|NM_001282545.1|\t4170\t20980\t70\t71\n"
        "gi|543583786|ref|NM_001282543.1|\t5466\t25324\t70\t71\n"
        "gi|543583785|ref|NM_000465.3|\t5523\t30980\t70\t71\n"
        "gi|543583740|ref|NM_001282549.1|\t3984\t36696\t70\t71\n"
        "gi|543583738|ref|NM_001282548.1|\t4113\t40851\t70\t71\n"
        "gi|530384540|ref|XM_005249645.1|\t2752\t45151\t70\t71\n"
        "gi|530384538|ref|XM_005249644.1|\t3004\t48071\t70\t71\n"
        "gi|530384536|ref|XM_005249643.1|\t3109\t51246\t70\t71\n"
        "gi|530384534|ref|XM_005249642.1|\t3097\t54528\t70\t71\n"
        "gi|530373237|ref|XM_005265508.1|\t2794\t57830\t70\t71\n"
        "gi|530373235|ref|XM_005265507.1|\t2848\t60824\t70\t71\n"
        "gi|530364726|ref|XR_241081.1|\t1009\t63849\t70\t71\n"
        "gi|530364725|ref|XR_241080.1|\t4884\t65009\t70\t71\n"
        "gi|530364724|ref|XR_241079.1|\t2819\t70099\t70\t71\n"
    )
    index_file = Faidx('data/genes.fasta.gz').indexname
    # Context manager so the index file handle is not leaked.
    with open(index_file) as index_handle:
        result_index = index_handle.read()
    assert result_index == expect_index
class TestFeatureKeyFunction:
    """Tests for opening 'data/genes.fasta' with a custom ``key_function``."""

    def __init__(self):
        """Open the FASTA as both ``Faidx`` and ``Fasta`` keyed by
        ``get_gene_name``."""
        self.fasta = os.path.join(path, 'data/genes.fasta')
        self.faidx = Faidx(self.fasta, key_function=get_gene_name)
        self.genes = Fasta(self.fasta, key_function=get_gene_name)

    def test_keys(self):
        """The sorted keys must equal the expected gene/accession names."""
        expected_keys = [
            'BARD1', 'FGFR2', 'KF435149.1', 'MDM4', 'NM_000465.3',
            'NM_001282543.1', 'NM_001282545.1', 'NM_001282548.1',
            'NM_001282549.1', 'NR_104212.1', 'NR_104215.1',
            'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
            'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
            'XR_241079.1', 'XR_241080.1', 'XR_241081.1',
        ]
        actual_keys = sorted(self.genes.keys())
        assert actual_keys == expected_keys

    def test_key_function_by_dictionary_get_key(self):
        """Slice [99:150] of the 'MDM4' record via dictionary access."""
        sliced = self.genes['MDM4'][99:150]
        assert str(sliced) == 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'

    def test_key_function_by_fetch(self):
        """Fetch 100-150 of the 'MDM4' record via ``Faidx.fetch``."""
        fetched = self.faidx.fetch('MDM4', 100, 150)
        assert str(fetched) == 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'

    @raises(ValueError)
    def test_duplicated_keys(self):
        """Opening with ``get_duplicated_gene_name`` must raise ValueError."""
        duplicated = Fasta(self.fasta, key_function=get_duplicated_gene_name)
def test_build_issue_96_fail_build_faidx(self):
    """Ensure that the fasta file is closed if construction of the 'Faidx'
    file fails when attempting to build an index.
    See mdshw5/pyfaidx#96
    """
    tmp_dir = mkdtemp()
    try:
        fasta_path = os.path.join(tmp_dir, 'issue_96.fasta')
        # Write simple fasta file with inconsistent sequence line lengths,
        # so building an index raises a 'FastaIndexingError'
        with open(fasta_path, 'w') as fasta_out:
            fasta_out.write(
                ">seq1\nCTCCGGGCCCAT\nAACACTTGGGGGTAGCTAAAGTGAA\nATAAAGCCTAAA\n"
            )

        builtins_open = builtins.open
        opened_files = []

        def test_open(*args, **kwargs):
            # Record every file object handed out while the patch is active.
            f = builtins_open(*args, **kwargs)
            opened_files.append(f)
            return f

        with mock.patch('six.moves.builtins.open', side_effect=test_open):
            try:
                Faidx(fasta_path)
            except FastaIndexingError:
                pass
            else:
                # TestCase has no ``assertFail``; ``fail`` is the real API,
                # so a missing exception now reports as a test failure
                # instead of an AttributeError.
                self.fail(
                    "Faidx construction should fail with 'FastaIndexingError'."
                )
        self.assertTrue(all(f.closed for f in opened_files))
    finally:
        shutil.rmtree(tmp_dir)
def split_target_sequence(target_chroms, target_fasta_name, inter_files):
    """Write one ``<chrm>.fa`` file per target chromosome into ``inter_files``.

    Parameters:
        target_chroms: iterable of chromosome names to extract.
        target_fasta_name: path of the target FASTA file.
        inter_files: directory that receives the per-chromosome files.

    Returns:
        The ``Fasta`` object opened on ``target_fasta_name``, keyed by the
        first whitespace-delimited token of each header.
    """
    # Result is discarded; presumably this only makes sure the .fai index
    # exists before Fasta opens the file - TODO confirm.
    Faidx(target_fasta_name)
    target_fasta_dict = Fasta(target_fasta_name, key_function=lambda x: x.split()[0])
    for chrm in target_chroms:
        if chrm != target_fasta_name:
            # Context manager guarantees the file is flushed and closed;
            # the handle used to be left open.
            with open(inter_files + "/" + chrm + ".fa", 'w') as out:
                out.write(">" + chrm + "\n" + str(target_fasta_dict[chrm]))
    return target_fasta_dict
def split_target_sequence(target_chroms, target_fasta_name):
    """Write one ``<chrm>.fa`` file per target chromosome into the current
    working directory.

    Parameters:
        target_chroms: iterable of chromosome names to extract.
        target_fasta_name: path of the target FASTA file.
    """
    # Result is discarded; presumably this only makes sure the .fai index
    # exists before Fasta opens the file - TODO confirm.
    Faidx(target_fasta_name)
    target_fasta = Fasta(target_fasta_name, key_function=lambda x: x.split()[0])
    for chrm in target_chroms:
        if chrm != target_fasta_name:
            # ``with`` closes the file even if the lookup or write raises.
            with open(chrm + ".fa", 'w') as out:
                out.write(">" + chrm + "\n" + str(target_fasta[chrm]))
def get_gene_sequences(parent_dict, ref_chroms, reference_fasta_name, processes, inter_files, liftover_type):
    """Run ``get_gene_sequences_subset`` for every reference chromosome in a
    process pool of size ``processes``.

    Results of the workers are discarded; the function returns ``None``.
    """
    workers = Pool(processes)
    # Result is discarded; presumably ensures the .fai index exists - TODO
    # confirm.
    Faidx(reference_fasta_name)
    per_chrom_task = partial(
        get_gene_sequences_subset,
        parent_dict,
        reference_fasta_name,
        inter_files,
        liftover_type,
    )
    # Drain the iterator so every chromosome is processed.
    for _ in workers.imap_unordered(per_chrom_task, ref_chroms):
        pass
    workers.close()
    workers.join()
def test_build_issue_83(self):
    """ Ensure that blank lines between entries are treated in the
    same way as samtools 1.2. See mdshw5/pyfaidx#83.

    Fields use explicit tab escapes because .fai records are tab-delimited.
    """
    expect_index = ("MT\t119\t4\t70\t71\nGL000207.1\t60\t187\t60\t61\n")
    index_file = Faidx('data/issue_83.fasta').indexname
    # Context manager so the handle is closed before the file is removed.
    with open(index_file) as index_handle:
        result_index = index_handle.read()
    os.remove('data/issue_83.fasta.fai')
    assert result_index == expect_index
def get_batches(NUM_BATCHES, GENOME_FASTA):
    """Split the sequences of a FASTA into batches of roughly equal size.

    Sequence names and lengths are read from the first two tab-separated
    columns of ``GENOME_FASTA + ".fai"``. The target batch size is
    ``int(total_length / NUM_BATCHES) + 1``.

    Returns a list in which every element is a one-item list wrapping a list
    of chunk records ``[scaffold, scaffold_length, start_index, size, info]``.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source - confirm against the original file.
    NOTE(review): when ``NUM_BATCHES <= 0`` ``CHUNK_SIZE`` is never assigned,
    so the loop raises ``NameError`` as soon as a sequence is processed.
    """
    # Return a 3-level list(ref): partitions -> chunks -> chunk properties (scaffold + coordinates)
    PARTS = []
    # Genome name is the file name up to its first "."
    GENOME_NAME = os.path.basename(GENOME_FASTA).split(".")[0]
    TOTAL_SIZE = 0
    SEQS = {}
    # The object is not used afterwards; presumably constructing it makes
    # sure the .fai file read below exists - TODO confirm.
    FAIDX = Faidx(GENOME_FASTA)
    FASTA_IDX = GENOME_FASTA + ".fai"
    with open(FASTA_IDX) as FILE:
        for LINE in FILE:
            LINE = LINE.rstrip()
            # Only name and length are used; the remaining columns are ignored.
            SEQ, SEQ_SIZE, JUNK = LINE.split("\t", 2)
            TOTAL_SIZE += int(SEQ_SIZE)
            SEQS[SEQ] = int(SEQ_SIZE)
    if NUM_BATCHES > 0:
        CHUNK_SIZE = int(TOTAL_SIZE / NUM_BATCHES) + 1
    BATCHES = []
    CURRENT_BATCH_SIZE = 0
    for SCAFFOLD in SEQS:
        # SEQ_SIZE is the part of this scaffold not yet assigned to a batch.
        SEQ_SIZE = SEQS[SCAFFOLD]
        SEQ_IDX = 0
        while SEQ_SIZE > 0:
            if (CURRENT_BATCH_SIZE + SEQ_SIZE) > CHUNK_SIZE:
                # Remainder does not fit: take just enough to fill the batch.
                FILL_SIZE = CHUNK_SIZE - CURRENT_BATCH_SIZE
                CHUNK_INFO = str(GENOME_NAME + ":" + SCAFFOLD + ":" +
                                 str(SEQ_IDX) + "-" + str(SEQ_SIZE))
                #NOTE: For scaffold size, always refer back to the index dict, not SEQ_SIZE,
                # since SEQ_SIZE changes depending on if the whole scaffold was used in
                # a single batch or not (as in the if statement of this loop)
                PARTS.append(
                    [SCAFFOLD, SEQS[SCAFFOLD], SEQ_IDX, FILL_SIZE, CHUNK_INFO])
                # Batch is full: emit it and start a new one.
                BATCHES.append([PARTS])
                PARTS = []
                SEQ_IDX += FILL_SIZE
                SEQ_SIZE -= FILL_SIZE
                CURRENT_BATCH_SIZE = 0
            else:
                # Remainder fits entirely into the current batch.
                CHUNK_INFO = str(GENOME_NAME + ":" + SCAFFOLD + ":" +
                                 str(SEQ_IDX) + "-" + str(SEQ_SIZE))
                PARTS.append(
                    [SCAFFOLD, SEQS[SCAFFOLD], SEQ_IDX, SEQ_SIZE, CHUNK_INFO])
                CURRENT_BATCH_SIZE += SEQ_SIZE
                SEQ_SIZE = 0
    #unclear if BATCHES will be in the appropriate hierarchy of lists/parts(elements) atm
    # This bit must be outside of the for loop, otherwise each iteration thru the loop
    # will append the current PARTS list to BATCHES x# of scaffolds in the PARTS list
    if PARTS:
        BATCHES.append([PARTS])
    return BATCHES
def split_target_sequence(target_chroms, target_fasta_name, inter_files):
    """Write one ``<chrm>.fa`` file per target chromosome into ``inter_files``.

    Parameters:
        target_chroms: iterable of chromosome names to extract.
        target_fasta_name: path of the target FASTA file.
        inter_files: directory that receives the per-chromosome files.

    Returns:
        The summed length of all records in the target FASTA.
    """
    # Result is discarded; presumably this only makes sure the .fai index
    # exists before Fasta opens the file - TODO confirm.
    Faidx(target_fasta_name)
    target_fasta = Fasta(target_fasta_name, key_function = lambda x: x.split()[0])
    genome_size = sum(len(value) for value in target_fasta.values())
    for chrm in target_chroms:
        if chrm != target_fasta_name:
            # Context manager guarantees the file is flushed and closed;
            # the handle used to be left open.
            with open(inter_files + "/" + chrm + ".fa", 'w') as out:
                out.write(">" + chrm + "\n" + str(target_fasta[chrm]))
    return genome_size
class TestFeatureBoundsCheck:
    """Bounds tests on 'data/genes.fasta' opened with ``default_seq='N'``."""

    def __init__(self):
        """Open the FASTA with ``default_seq='N'``."""
        fasta_path = os.path.join(path, 'data/genes.fasta')
        self.fasta = fasta_path
        self.faidx = Faidx(fasta_path, default_seq='N')

    def test_fetch_border_padded(self):
        """Fetch 480-500 past the end of the record; the expected string is
        'TC' followed by 'N' characters."""
        fetched = self.faidx.fetch('KF435150.1', 480, 500)
        assert str(fetched) == 'TCNNNNNNNNNNNNNNNNNNN'
def test_issue_144_no_defline(self):
    """Index a FASTA that has sequence lines but no header line.

    See mdshw5/pyfaidx#144. This body makes no assertion itself.
    NOTE(review): the expected-exception check is presumably supplied by a
    decorator outside this snippet - confirm.
    """
    tmp_dir = mkdtemp()
    try:
        fasta_path = os.path.join(tmp_dir, 'issue_144.fasta')
        # Write simple fasta file
        with open(fasta_path, 'w') as fasta_out:
            fasta_out.writelines(["CTCCGGGCCCAT\n", "ATAAAGCCTAAA\n"])
        headerless_index = Faidx(fasta_path)
    finally:
        shutil.rmtree(tmp_dir)
def test_read_back_index(self):
    """Write the index with ``write_fai()`` under the 'en_US.utf8' numeric
    locale, then reopen the FASTA with ``build_index=False``.

    The previous LC_NUMERIC locale is restored afterwards.
    """
    import locale
    fasta_path = 'data/genes.fasta'
    saved_locale = locale.getlocale(locale.LC_NUMERIC)
    try:
        locale.setlocale(locale.LC_NUMERIC, 'en_US.utf8')
        handle = Faidx(fasta_path)
        handle.write_fai()
        handle = Faidx(fasta_path, build_index=False)
    finally:
        locale.setlocale(locale.LC_NUMERIC, saved_locale)
class TestFeatureBoundsCheck:
    """Fetch tests on the 'KF435150.1' record of 'data/genes.fasta'."""

    def __init__(self):
        """Open the FASTA once with defaults and once with
        ``strict_bounds=True``."""
        fasta_path = os.path.join(path, 'data/genes.fasta')
        self.fasta = fasta_path
        self.faidx = Faidx(fasta_path)
        self.faidx_strict = Faidx(fasta_path, strict_bounds=True)

    def test_fetch_whole_entry(self):
        """Fetch 1-482 and compare against the full expected sequence."""
        expected_sequence = ''.join([
            'ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA',
            'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA',
            'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG',
            'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT',
            'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG',
            'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG',
            'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT',
            'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA',
            'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA',
            'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC',
        ])
        fetched = self.faidx.fetch('KF435150.1', 1, 482)
        assert str(fetched) == expected_sequence

    def test_fetch_middle(self):
        """Fetch 100-150 from the middle of the record."""
        fetched = self.faidx.fetch('KF435150.1', 100, 150)
        assert str(fetched) == 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'

    def test_fetch_end(self):
        """Fetch 480-482; the expected result is 'TC'."""
        fetched = self.faidx.fetch('KF435150.1', 480, 482)
        assert str(fetched) == 'TC'

    def test_fetch_border(self):
        """Fetch 480-500, past the end of the record; still expects 'TC'."""
        fetched = self.faidx.fetch('KF435150.1', 480, 500)
        assert str(fetched) == 'TC'

    def test_rev(self):
        """Negating the 480-482 fetch result must stringify to 'GA'."""
        fetched = self.faidx.fetch('KF435150.1', 480, 482)
        assert str(-fetched) == 'GA', fetched

    @raises(FetchError)
    def test_fetch_past_bounds(self):
        """Fetching 480-5000 with ``strict_bounds=True`` must raise
        FetchError."""
        expect = 'TC'
        fetched = self.faidx_strict.fetch('KF435150.1', 480, 5000)
def get_transcriptome(fa_input, fa_output, fai_output, log):
    """Rewrite a transcriptome FASTA with short record names and index it.

    Logging goes to the file ``log`` (overwritten). Every record read from
    ``fa_input`` is written to ``fa_output`` as ``>short_name`` plus its
    sequence, then ``fa_output`` is indexed with ``Faidx``. Any exception is
    logged with its traceback and re-raised.
    """
    logging.basicConfig(
        filename=log,
        filemode="w",
        level=logging.INFO,
        format='%(message)s',
    )
    logging.info("timestamp: {}".format(str(datetime.datetime.now())))
    # locals() is evaluated before any other local is bound, so this logs
    # exactly the four call arguments.
    for arg_name, arg_value in locals().items():
        logging.info("\t{}: {}\n".format(arg_name, arg_value))

    try:
        # Parse fasta file uncompress and simplify transcript ids
        logging.info("Read input transcriptome fasta file")
        with open(fa_output, "w") as simplified:
            simplified.writelines(
                ">{}\n{}\n".format(rec.short_name, rec.seq)
                for rec in Fasta.Reader(fa_input)
            )

        logging.info("Index fasta file")
        with Faidx(fa_output) as indexer:
            indexer.build_index()

    except BaseException:
        logging.exception('Error while running get_transcriptome')
        raise
def test_order(self):
    """The index keys of 'data/genes.fasta' must appear in this exact order."""
    expected_order = (
        "gi|563317589|dbj|AB821309.1|",
        "gi|557361099|gb|KF435150.1|",
        "gi|557361097|gb|KF435149.1|",
        "gi|543583796|ref|NR_104216.1|",
        "gi|543583795|ref|NR_104215.1|",
        "gi|543583794|ref|NR_104212.1|",
        "gi|543583788|ref|NM_001282545.1|",
        "gi|543583786|ref|NM_001282543.1|",
        "gi|543583785|ref|NM_000465.3|",
        "gi|543583740|ref|NM_001282549.1|",
        "gi|543583738|ref|NM_001282548.1|",
        "gi|530384540|ref|XM_005249645.1|",
        "gi|530384538|ref|XM_005249644.1|",
        "gi|530384536|ref|XM_005249643.1|",
        "gi|530384534|ref|XM_005249642.1|",
        "gi|530373237|ref|XM_005265508.1|",
        "gi|530373235|ref|XM_005265507.1|",
        "gi|530364726|ref|XR_241081.1|",
        "gi|530364725|ref|XR_241080.1|",
        "gi|530364724|ref|XR_241079.1|",
    )
    index = Faidx('data/genes.fasta').index
    actual_order = tuple(index.keys())
    assert actual_order == expected_order
def test_fetch_keyerror(self):
    """Fetch 1-10 from a record name that is not in the file.

    This body makes no assertion itself.
    NOTE(review): the expected-exception check is presumably supplied by a
    decorator outside this snippet - confirm.
    """
    missing_name = "gi|joe|gb|KF435150.1|"
    strict_index = Faidx("data/genes.fasta", strict_bounds=True)
    fetched = strict_index.fetch(missing_name, 1, 10)
def __init__(self):
    """Open 'data/genes.fasta' (relative to ``path``) with
    ``default_seq='N'``."""
    fasta_path = os.path.join(path, 'data/genes.fasta')
    self.fasta = fasta_path
    self.faidx = Faidx(fasta_path, default_seq='N')
def __init__(self):
    """Open 'data/genes.fasta' (relative to ``path``) as both ``Faidx`` and
    ``Fasta`` keyed by ``get_gene_name``."""
    fasta_path = os.path.join(path, 'data/genes.fasta')
    self.fasta = fasta_path
    self.faidx = Faidx(fasta_path, key_function=get_gene_name)
    self.genes = Fasta(fasta_path, key_function=get_gene_name)
def test_issue_74_end_faidx(self):
    """The ``end`` attribute of a 1-90 fetch must be the same whether
    ``one_based_attributes`` is False or True."""
    fasta_path = 'data/genes.fasta.gz'
    record_name = 'gi|557361099|gb|KF435150.1|'
    zero_based = Faidx(fasta_path, one_based_attributes=False)
    one_based = Faidx(fasta_path, one_based_attributes=True)
    end_zero_based = zero_based.fetch(record_name, 1, 90).end
    end_one_based = one_based.fetch(record_name, 1, 90).end
    assert end_zero_based == end_one_based
def test_key_function_by_fetch(self):
    """Fetch by short key from an index built with ``split_char='|'``."""
    index = Faidx('data/genes.fasta', split_char='|')
    fetched = index.fetch('KF435150.1', 100, 150)
    assert str(fetched) == 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
def test_fetch_reversed_coordinates(self):
    """Fetch with start (50) greater than end (10) under
    ``strict_bounds=True``.

    This body makes no assertion itself.
    NOTE(review): the expected-exception check is presumably supplied by a
    decorator outside this snippet - confirm.
    """
    record_name = 'gi|557361099|gb|KF435150.1|'
    strict_index = Faidx('data/genes.fasta.gz', strict_bounds=True)
    fetched = strict_index.fetch(record_name, 50, 10)
def test_fetch_keyerror(self):
    """Fetch 1-10 from a record name that is not in the gzipped file.

    This body makes no assertion itself.
    NOTE(review): the expected-exception check is presumably supplied by a
    decorator outside this snippet - confirm.
    """
    missing_name = 'gi|joe|gb|KF435150.1|'
    strict_index = Faidx('data/genes.fasta.gz', strict_bounds=True)
    fetched = strict_index.fetch(missing_name, 1, 10)
def test_rev(self):
    """Negating the 480-481 fetch result must stringify to 'GA'."""
    index = Faidx('data/genes.fasta.gz')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 480, 481)
    assert str(-fetched) == 'GA', fetched
def test_fetch_reversed_coordinates(self):
    """Fetch with start (50) greater than end (10) under
    ``strict_bounds=True``.

    This body makes no assertion itself.
    NOTE(review): the expected-exception check is presumably supplied by a
    decorator outside this snippet - confirm.
    """
    record_name = "gi|557361099|gb|KF435150.1|"
    strict_index = Faidx("data/genes.fasta", strict_bounds=True)
    fetched = strict_index.fetch(record_name, 50, 10)
def test_fetch_negative(self):
    """Fetch with a negative start coordinate (-10) under
    ``strict_bounds=True``.

    This body makes no assertion itself.
    NOTE(review): the expected-exception check is presumably supplied by a
    decorator outside this snippet - confirm.
    """
    record_name = 'gi|557361099|gb|KF435150.1|'
    strict_index = Faidx('data/genes.fasta.gz', strict_bounds=True)
    fetched = strict_index.fetch(record_name, -10, 10)
def __init__(self):
    """Open 'data/genes.fasta' (relative to ``path``) once with defaults and
    once with ``strict_bounds=True``."""
    fasta_path = os.path.join(path, 'data/genes.fasta')
    self.fasta = fasta_path
    self.faidx = Faidx(fasta_path)
    self.faidx_strict = Faidx(fasta_path, strict_bounds=True)
def test_fetch_middle(self):
    """Fetch 100-150 from the middle of KF435150.1 in the gzipped FASTA."""
    index = Faidx('data/genes.fasta.gz')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 100, 150)
    assert str(fetched) == 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
def test_rev(self):
    """Negating the 480-482 fetch result must stringify to "GA"."""
    index = Faidx("data/genes.fasta")
    fetched = index.fetch("gi|557361099|gb|KF435150.1|", 480, 482)
    assert str(-fetched) == "GA", fetched
def test_fetch_border(self):
    """Fetch 480-500 from the KF435150.1 record, i.e. past its end.

    The result is expected to contain only "TC".
    """
    record_name = "gi|557361099|gb|KF435150.1|"
    index = Faidx("data/genes.fasta")
    fetched = index.fetch(record_name, 480, 500)
    assert str(fetched) == "TC"
def test_fetch_end(self):
    """Fetch 480-482 of KF435150.1; the expected result is "TC"."""
    index = Faidx("data/genes.fasta")
    fetched = index.fetch("gi|557361099|gb|KF435150.1|", 480, 482)
    assert str(fetched) == "TC"
def test_fetch_middle(self):
    """Fetch 100-150 from the middle of KF435150.1."""
    index = Faidx("data/genes.fasta")
    fetched = index.fetch("gi|557361099|gb|KF435150.1|", 100, 150)
    assert str(fetched) == "TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA"
def test_key_function_by_fetch(self):
    """Fetch 100-150 of 'MDM4' from an index keyed by ``get_gene_name``."""
    index = Faidx('data/genes.fasta', key_function=get_gene_name)
    fetched = index.fetch('MDM4', 100, 150)
    assert str(fetched) == 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
def test_fetch_past_bounds(self):
    """Fetch 480-5000, far past the end of the record, under
    ``strict_bounds=True``.

    This body makes no assertion itself.
    NOTE(review): the expected-exception check is presumably supplied by a
    decorator outside this snippet - confirm.
    """
    record_name = "gi|557361099|gb|KF435150.1|"
    strict_index = Faidx("data/genes.fasta", strict_bounds=True)
    fetched = strict_index.fetch(record_name, 480, 5000)
def test_issue_134_no_build_index(self):
    """Open 'data/genes.fasta' with ``build_index=False``.

    See mdshw5/pyfaidx#134. This body makes no assertion itself.
    NOTE(review): any check that no index is built presumably lives in a
    decorator or fixture outside this snippet - confirm.
    """
    unindexed = Faidx('data/genes.fasta', build_index=False)
def test_samtools_compare(self):
    """The index built by ``Faidx.build_fai`` for ``self.fasta`` must equal
    the text stored in the file ``self.samtools``."""
    with open(self.samtools, 'r') as reference_handle:
        reference_text = reference_handle.read()
    built_text = ''.join(Faidx.build_fai(self.fasta))
    assert built_text == reference_text
def test_build(self):
    """The index built by ``Faidx.build_fai`` for ``self.fasta`` must equal
    the text stored in the file ``self.expect``."""
    with open(self.expect, 'r') as reference_handle:
        reference_text = reference_handle.read()
    built_text = ''.join(Faidx.build_fai(self.fasta))
    assert built_text == reference_text
def test_order(self):
    """The first field of every built index line must match, in order, the
    keys of ``self.faidx.index``."""
    built_lines = Faidx.build_fai(self.fasta)
    built_names = [entry.split()[0] for entry in built_lines]
    indexed_names = list(self.faidx.index.keys())
    assert built_names == indexed_names
from collections import defaultdict from pyfaidx import Faidx import sys sys.path.append('/data/home/xutun/mySrc/modifyPoppyPaper') from getUse import dd,classF annotDiamondF = f'{dd}/isoseqDiamondAnnot.prot.diamond' transProtFa = f'{dd}/total.merge_corrected.faa' annotProtFa = f'{dd}/ref/poppy_v6.proteins.final_revised.fasta' toMergeGeneF = f'{dd}/toMergeGene.tab' classD = pd.read_table(classF,sep='\t') candidateTransSet = defaultdict(list) trans2annotDiamondSet = defaultdict(lambda:defaultdict(int)) transProtFaHandle = Faidx(transProtFa) annotProtFaHandle = Faidx(annotProtFa) resultTrans2protLen = defaultdict(int) def isMerge(gene): gene = gene.split('_') if len(gene)>=2: if len(gene[0])==11 and len(gene[1])==11: if 'PS' in gene[0] and 'PS' in gene[1]: return True return False def getTrans2gene(): trans2gene = defaultdict(int) for ind,row in classD.iterrows():
def test_fetch_end(self):
    """Fetch 480-481 of KF435150.1; the expected result is 'TC'."""
    index = Faidx('data/genes.fasta')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 480, 481)
    assert str(fetched) == 'TC'