def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
    """Convert GenBank input straight to FASTA output (PRIVATE).

    The records are read with ``GenBankScanner.parse_records`` with
    ``do_features=False``, so the feature table is not parsed, and are then
    passed to ``SeqIO.write`` using the "fasta" format.

    Arguments:
     - in_handle - handle to read the GenBank data from.
     - out_handle - handle the FASTA output is written to.
     - alphabet - accepted for signature compatibility; not used here.

    Returns whatever ``SeqIO.write`` returns.
    """
    from Bio.GenBank.Scanner import GenBankScanner

    scanner = GenBankScanner()
    # FASTA output carries no features, so skip parsing them.
    parsed = scanner.parse_records(in_handle, do_features=False)
    # The alphabet is likewise irrelevant when writing FASTA.
    return SeqIO.write(parsed, out_handle, "fasta")
def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
    """Break up a GenBank file into SeqRecord objects, one per CDS feature.

    Every section from the LOCUS line to the terminating // can contain
    many CDS features.  Each one is returned as a record carrying the
    stated amino acid translation sequence (if given).

    Arguments:
     - handle - handle to the GenBank data.
     - alphabet - alphabet forwarded to the scanner (defaults to
       ``Alphabet.generic_protein``).
    """
    scanner = GenBankScanner(debug=0)
    # parse_cds_features is a generator function, so nothing is read yet.
    return scanner.parse_cds_features(handle, alphabet)
def GenBankIterator(handle):
    """Break up a GenBank file into SeqRecord objects.

    Every section from the LOCUS line to the terminating // becomes
    a single SeqRecord with associated annotation and features.

    Note that for genomes or chromosomes, there is typically only
    one record.
    """
    scanner = GenBankScanner(debug=0)
    # parse_records is a generator function, so nothing is read yet.
    return scanner.parse_records(handle)
def reformat_genbank_first_line(first_line_inp, test_line=True):
    """Rebuild a GenBank LOCUS line so that it is padded out to 80 columns.

    The line is split on whitespace.  All tokens between the leading LOCUS
    token and the token immediately before ``bp`` (presumably the sequence
    length) are joined with underscores to form the locus name; the
    remaining tokens are lower-cased and joined with single spaces.  The
    result is a 12-column ``LOCUS`` header, the name, filler spaces, and the
    remaining tokens, followed by a newline.

    Arguments:
     - first_line_inp - the first line of a GenBank record, as a string.
     - test_line - if true, feed the rebuilt line to
       ``GenBankScanner._feed_first_line`` and report any exception raised.

    Returns a tuple ``(str_out, error_name)``:
     - ``str_out`` is the rebuilt line, or ``''`` if the input was rejected.
     - ``error_name`` is ``None`` on success, a message string if the input
       has no LOCUS token or no ``bp`` token, or the exception object raised
       by the scanner when ``test_line`` is true.
    """
    str_out = ''
    error_name = None
    line_parts = first_line_inp.split()

    # An empty or whitespace-only line has no tokens; treat it like any other
    # line that lacks LOCUS instead of failing on line_parts[0].
    if not line_parts or 'LOCUS' not in line_parts[0]:
        error_name = 'Massive error: LOCUS not on first line'
        return str_out, error_name

    try:
        bp_index = line_parts.index('bp')
    except ValueError:
        error_name = 'Missing bp'
        return str_out, error_name

    name = '_'.join(line_parts[1:bp_index - 1])
    padding_data = ' '.join(x.lower() for x in line_parts[bp_index - 1:])

    # The header must really occupy the 12 columns that the width
    # calculation accounts for, otherwise the line comes out too short.
    header = 'LOCUS'.ljust(12)
    total_len = len(header) + len(name) + len(padding_data)
    if total_len >= 80:
        # No room left for padding: keep a single separating space.
        extra_space = ' '
    else:
        extra_space = ' ' * (80 - total_len)
    str_out = header + name + extra_space + padding_data + '\n'

    # NOTE(review): looks like diagnostic output (the scanner below is also
    # run with debug=1); consider routing it through logging.
    print(str_out.split())

    if test_line:
        consumer = _FeatureConsumer(use_fuzziness=1,
                                    feature_cleaner=FeatureValueCleaner())
        try:
            GenBankScanner(debug=1)._feed_first_line(consumer, str_out)
        except Exception as err:
            # Deliberately broad: any parser failure is reported to the
            # caller rather than raised.  The exception object itself is
            # returned, not its message.
            error_name = err
    return str_out, error_name
def GenBankIterator(handle):
    """Break up a GenBank file into SeqRecord objects.

    Every section from the LOCUS line to the terminating // becomes
    a single SeqRecord with associated annotation and features.

    Note that for genomes or chromosomes, there is typically only
    one record.

    This gets called internally by Bio.SeqIO for the GenBank file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("GenBank/cor6_6.gb", "gb"):
    ...     print(record.id)
    ...
    X55053.1
    X62281.1
    M81224.1
    AJ237582.1
    L31939.1
    AF297471.1

    Equivalently,

    >>> with open("GenBank/cor6_6.gb") as handle:
    ...     for record in GenBankIterator(handle):
    ...         print(record.id)
    ...
    X55053.1
    X62281.1
    M81224.1
    AJ237582.1
    L31939.1
    AF297471.1

    """
    scanner = GenBankScanner(debug=0)
    # parse_records is a generator function, so nothing is read yet.
    return scanner.parse_records(handle)