def _parse_single_genbank(chunks):
    '''Parse the lines of one GenBank record, section by section.

    The record is split into sections; a new section begins at every
    line whose first character is not whitespace. The first
    whitespace-delimited token of a section's first line is its header,
    which selects the parser from ``_PARSER_TABLE`` (falling back to
    ``_parse_section_default``).

    Parameters
    ----------
    chunks : iterable
        Lines of a single record, handed to the section splitter
        un-stripped.

    Returns
    -------
    tuple
        ``(sequence, metadata, interval_metadata)`` where ``sequence``
        is the parsed ``ORIGIN`` section (``''`` if there is none),
        ``metadata`` maps every other header to its parsed value
        (``REFERENCE`` maps to a list, one entry per occurrence) and
        ``interval_metadata`` is the parsed ``FEATURES`` section
        (``None`` if there is none).

    Raises
    ------
    KeyError
        If a ``FEATURES`` section is reached before ``LOCUS`` has been
        parsed, because the feature parser needs the locus size.
    '''
    def starts_section(line):
        # each section starts with a HEADER without indent.
        return not line[0].isspace()

    split_sections = _yield_section(starts_section, strip=False)

    sequence = ''
    interval_metadata = None
    metadata = {}

    for section in split_sections(chunks):
        header = section[0].split(None, 1)[0]
        parse = _PARSER_TABLE.get(header, _parse_section_default)

        if header == 'FEATURES':
            # This requires 'LOCUS' line parsed before 'FEATURES', which
            # should be true and is implicitly checked by the sniffer.
            locus_size = metadata['LOCUS']['size']
            parse = partial(parse, length=locus_size)

        parsed = parse(section)

        if header == 'ORIGIN':
            sequence = parsed
        elif header == 'FEATURES':
            interval_metadata = parsed
        elif header == 'REFERENCE':
            # reference can appear multiple times; collect all of them
            metadata.setdefault(header, []).append(parsed)
        else:
            metadata[header] = parsed

    return sequence, metadata, interval_metadata
def _parse_single_genbank(chunks):
    '''Parse the lines of one GenBank record, section by section.

    The record is split into sections; a new section begins at every
    line whose first character is not whitespace. The first
    whitespace-delimited token of a section's first line is its header,
    which selects the parser from ``_PARSER_TABLE`` (falling back to
    ``_parse_section_default``).

    Parameters
    ----------
    chunks : iterable
        Lines of a single record, handed to the section splitter
        un-stripped.

    Returns
    -------
    tuple
        ``(sequence, metadata, interval_metadata)`` where ``sequence``
        is the parsed ``ORIGIN`` section (``''`` if there is none),
        ``metadata`` maps every other header to its parsed value
        (``REFERENCE`` maps to a list, one entry per occurrence) and
        ``interval_metadata`` is the parsed ``FEATURES`` section
        (``None`` if there is none).

    Raises
    ------
    KeyError
        If a ``FEATURES`` section is reached before ``LOCUS`` has been
        parsed, because the feature parser needs the locus size.
    '''
    def starts_section(line):
        # each section starts with a HEADER without indent.
        return not line[0].isspace()

    split_sections = _yield_section(starts_section, strip=False)

    sequence = ''
    interval_metadata = None
    metadata = {}

    for section in split_sections(chunks):
        header = section[0].split(None, 1)[0]
        parse = _PARSER_TABLE.get(header, _parse_section_default)

        if header == 'FEATURES':
            # This requires 'LOCUS' line parsed before 'FEATURES', which
            # should be true and is implicitly checked by the sniffer.
            locus_size = metadata['LOCUS']['size']
            parse = partial(parse, length=locus_size)

        parsed = parse(section)

        if header == 'ORIGIN':
            sequence = parsed
        elif header == 'FEATURES':
            interval_metadata = parsed
        elif header == 'REFERENCE':
            # reference can appear multiple times; collect all of them
            metadata.setdefault(header, []).append(parsed)
        else:
            metadata[header] = parsed

    return sequence, metadata, interval_metadata
def _parse_reference(lines):
    '''Parse single REFERENCE field.

    The lines are split into sub-sections: a line that does not start
    with 11 spaces opens a new sub-section, and blank lines are skipped.
    Each sub-section is parsed with ``_parse_section_default`` (joining
    its lines with a single space) into a ``(label, data)`` pair.

    Parameters
    ----------
    lines : iterable
        Lines of one REFERENCE section.

    Returns
    -------
    dict
        Maps each sub-section label to its parsed data. If a label
        occurs more than once, the last occurrence wins.
    '''
    # magic number 11: the non keyworded lines in REFERENCE
    # are at least indented with 11 spaces.
    continuation_prefix = ' ' * 11

    def starts_subsection(line):
        return not line.startswith(continuation_prefix)

    split_subsections = _yield_section(
        starts_subsection, skip_blanks=True, strip=False)

    labelled = (
        _parse_section_default(
            subsection, join_delimiter=' ', return_label=True)
        for subsection in split_subsections(lines)
    )
    return {label: data for label, data in labelled}
def _parse_source(lines):
    '''Parse SOURCE field.

    The lines are split into sub-sections: a line that does not start
    with 11 spaces opens a new sub-section, and blank lines are skipped.
    Exactly two sub-sections are expected; the first one (the SOURCE
    line itself) is discarded and the second is read as the organism
    block.

    Parameters
    ----------
    lines : iterable
        Lines of the SOURCE section.

    Returns
    -------
    dict
        ``'ORGANISM'`` holds the text following the first
        whitespace-delimited token on the organism block's first line,
        stripped. ``'taxonomy'`` holds the remaining lines of that
        block, each stripped and joined with a single space.

    Raises
    ------
    ValueError
        If the section does not split into exactly two sub-sections.
    IndexError
        If the organism block's first line has no text after its first
        token.
    '''
    # magic number 11: the non keyworded lines in SOURCE
    # are at least indented with 11 spaces.
    continuation_prefix = ' ' * 11

    def starts_subsection(line):
        return not line.startswith(continuation_prefix)

    split_subsections = _yield_section(
        starts_subsection, skip_blanks=True, strip=False)

    subsections = list(split_subsections(lines))
    # SOURCE line is not informative; skip it
    _, organism_block = subsections

    organism_name = organism_block[0].split(None, 1)[1].strip()
    taxonomy = ' '.join(line.strip() for line in organism_block[1:])
    return {'ORGANISM': organism_name, 'taxonomy': taxonomy}