Example #1
0
def _parse_single_genbank(chunks):
    '''Parse the lines of a single GenBank record.

    The input is cut into sections, where a new section begins on every
    line whose first character is not whitespace. The first
    whitespace-delimited token of that line is the section header; it
    selects a parser from ``_PARSER_TABLE`` (``_parse_section_default``
    is used for headers that have no entry).

    Parameters
    ----------
    chunks
        The lines of one record, handed unchanged to the section splitter.

    Returns
    -------
    tuple
        ``(sequence, metadata, interval_metadata)`` where ``sequence`` is
        the parsed ORIGIN section (``''`` if there was none),
        ``metadata`` is a dict of parsed sections keyed by header (the
        REFERENCE entry is a list with one item per REFERENCE section),
        and ``interval_metadata`` is the parsed FEATURES section (``None``
        if there was none).
    '''
    annotations = {}
    features = None
    seq = ''

    def _opens_section(line):
        # Section headers are the only lines that carry no indentation.
        return not line[0].isspace()

    split_sections = _yield_section(_opens_section, strip=False)
    for block in split_sections(chunks):
        keyword = block[0].split(None, 1)[0]
        handler = _PARSER_TABLE.get(keyword, _parse_section_default)

        if keyword == 'FEATURES':
            # The features parser is given the size recorded in the LOCUS
            # section, so LOCUS has to be parsed before FEATURES. That
            # ordering should hold and is implicitly checked by the
            # sniffer.
            handler = partial(handler, length=annotations['LOCUS']['size'])

        result = handler(block)

        if keyword == 'ORIGIN':
            seq = result
        elif keyword == 'FEATURES':
            features = result
        elif keyword == 'REFERENCE':
            # REFERENCE may occur several times; gather every occurrence.
            annotations.setdefault(keyword, []).append(result)
        else:
            annotations[keyword] = result
    return seq, annotations, features
Example #2
0
def _parse_single_genbank(chunks):
    '''Split one GenBank record into sections and parse each of them.

    A section starts at a line with no leading whitespace; the first
    token on that line names the section and picks its parser out of
    ``_PARSER_TABLE``, with ``_parse_section_default`` as the fallback.

    Returns the tuple ``(sequence, metadata, interval_metadata)``:

    * ``sequence`` -- result of parsing ORIGIN, or ``''`` when absent;
    * ``metadata`` -- dict mapping every other header to its parsed
      section, except that REFERENCE maps to a list holding one parsed
      result per REFERENCE section;
    * ``interval_metadata`` -- result of parsing FEATURES, or ``None``
      when absent.
    '''
    def is_header_line(line):
        # each section starts with a HEADER without indent.
        return not line[0].isspace()

    record_meta = {}
    feature_table = None
    residues = ''
    for lines in _yield_section(is_header_line, strip=False)(chunks):
        key = lines[0].split(None, 1)[0]
        parse = _PARSER_TABLE.get(key, _parse_section_default)

        is_features = key == 'FEATURES'
        if is_features:
            # Needs the LOCUS section to be in ``record_meta`` already:
            # its 'size' entry is forwarded to the parser as ``length``.
            # LOCUS preceding FEATURES should be true and is implicitly
            # checked by the sniffer.
            parse = partial(parse, length=record_meta['LOCUS']['size'])

        value = parse(lines)

        if is_features:
            feature_table = value
            continue
        if key == 'ORIGIN':
            residues = value
            continue
        if key != 'REFERENCE':
            record_meta[key] = value
            continue

        # reference can appear multiple times, so it is stored as a list
        try:
            record_meta[key].append(value)
        except KeyError:
            record_meta[key] = [value]
    return residues, record_meta, feature_table
Example #3
0
def _parse_reference(lines):
    '''Parse single REFERENCE field.

    The field is divided into sub-sections, each beginning at a line that
    does not start with 11 spaces. Every sub-section is parsed with
    ``_parse_section_default`` (lines joined by a single space) and stored
    in the returned dict under the label that parser reports.
    '''
    # magic number 11: lines that do not open a new keyword inside
    # REFERENCE are indented with at least 11 spaces.
    continuation_prefix = ' ' * 11

    def _opens_subsection(line):
        return not line.startswith(continuation_prefix)

    split_subsections = _yield_section(
        _opens_subsection, skip_blanks=True, strip=False)

    return {
        label: data
        for label, data in (
            _parse_section_default(
                subsection, join_delimiter=' ', return_label=True)
            for subsection in split_subsections(lines))
    }
Example #4
0
def _parse_source(lines):
    '''Parse SOURCE field.

    The field must split into exactly two sub-sections (a new one starts
    at any line not indented by 11 spaces); otherwise the unpacking below
    raises ``ValueError``. The first sub-section is discarded. From the
    second, the text after the first token of its first line becomes
    ``'ORGANISM'`` and the remaining lines, stripped and joined with
    single spaces, become ``'taxonomy'``.
    '''
    # magic number 11: lines in SOURCE that do not open a new keyword
    # are indented with at least 11 spaces.
    continuation_prefix = ' ' * 11

    def _opens_subsection(line):
        return not line.startswith(continuation_prefix)

    split_subsections = _yield_section(
        _opens_subsection, skip_blanks=True, strip=False)

    # The SOURCE sub-section itself is not informative; only the one that
    # follows it is used.
    _source_part, organism_part = list(split_subsections(lines))

    first_line = organism_part[0]
    organism_name = first_line.split(None, 1)[1].strip()
    lineage = ' '.join(entry.strip() for entry in organism_part[1:])
    return {'ORGANISM': organism_name, 'taxonomy': lineage}