コード例 #1
0
    def test_parsers_error(self):
        """DelimitedRecordFinder should raise RecordError if trailing data"""
        good =  [   '  \t   abc  \n',
                    '\t   def\n',
                    '// \t\n',
                    '\t\n',
                    '\t efg \n',
                    '\t\t//\n',
                ]
        blank = ['', '   ', '\t    \t\n\n']
        bad = ['abc']

        result = [['abc', 'def', '//'], ['efg','//']]
        r = DelimitedRecordFinder('//')

        self.assertEqual(list(r(good)), result)
        self.assertEqual(list(r(good+blank)), result)
        # Trailing data after the last delimiter must raise in strict mode.
        try:
            list(r(good+bad))
        except RecordError:
            pass
        else:
            # Call form instead of the Python-2-only `raise E, msg` statement,
            # so this test also parses under Python 3.
            raise AssertionError("Parser failed to raise error on bad data")

        # In non-strict mode the trailing partial record is returned instead.
        r = DelimitedRecordFinder('//', strict=False)
        self.assertEqual(list(r(good+bad)), result + [['abc']])
コード例 #2
0
 def test_parsers(self):
     """DelimitedRecordFinder should split records into lines correctly"""
     data = 'abc\ndef\n//\nefg\n//'.split()
     keep = DelimitedRecordFinder('//')
     drop = DelimitedRecordFinder('//', keep_delimiter=False)
     # Delimiter line is retained by default...
     self.assertEqual(list(keep(data)),
                      [['abc', 'def', '//'], ['efg', '//']])
     # ...and omitted when keep_delimiter is False.
     self.assertEqual(list(drop(data)),
                      [['abc', 'def'], ['efg']])
コード例 #3
0
    def __call__(self, infile):
        """ Yield AAIndexRecord objects parsed from an AAIndex file.

            infile = file to parse as file object or list of lines

            Usage:
                aa1p = AAIndex1Parser()
                aaIndex1Objects = aa1p('data/AAIndex1')

                aa2p = AAIndex2Parser()
                aaIndex2Objects = aa2p('data/AAIndex2')

            NOTE(review): this is a generator of records, not the dict keyed
            by ID that the original comment claimed; the unused `result = {}`
            accumulator has been removed.
        """
        # Break the file into records delimited by '//'; rstrip removes only
        # trailing whitespace so significant leading whitespace survives.
        AAIndexRecordFinder = DelimitedRecordFinder('//', constructor=rstrip)
        # parser is a generator of raw record line-lists from the file
        parser = AAIndexRecordFinder(infile)

        for r in parser:
            new_record = self._parse_record(r)
            # _parse_record presumably returns a falsy value for records it
            # cannot parse — those are silently skipped. TODO confirm.
            if new_record:
                yield new_record
コード例 #4
0
File: greengenes.py  Project: mikerobeson/pycogent
def MinimalGreengenesParser(lines,
                            LineDelim="=",
                            RecStart="BEGIN",
                            RecEnd="END"):
    """Parses raw Greengenes 16S rRNA Gene records

    lines  :  open records file
    LineDelim  :  individual line delimiter, eg foo=bar
    RecStart  :  start identifier for a record
    RecEnd  :  end identifier for a record
    """
    split_line = DefaultDelimitedSplitter(delimiter=LineDelim)

    # The end-of-record marker must be compared in its post-split form,
    # since every line is run through the same splitter.
    end_marker = split_line(RecEnd)

    # Record-start marker lines carry no data; skip them entirely.
    skip_start = make_ignore_f(RecStart)

    record_iter = DelimitedRecordFinder(end_marker,
                                        constructor=split_line,
                                        keep_delimiter=False,
                                        ignore=skip_start)

    for rec in record_iter(lines):
        yield GenericRecord(rec)
コード例 #5
0
 def test_parsers_ignore(self):
     """DelimitedRecordFinder should skip lines to ignore."""
     def never(line):
         return False

     def ignore_labels(line):
         return (not line) or line.isspace() or line.startswith('#')

     data = ['>abc', '\n', '1', '$$', '>def', '#ignore', '2', '$$']
     # Default ignore drops blank lines.
     self.assertEqual(list(DelimitedRecordFinder('$$')(data)),
                      [['>abc', '1', '$$'], ['>def', '#ignore', '2', '$$']])
     # never() retains every line, so the (stripped) blank survives as ''.
     self.assertEqual(list(DelimitedRecordFinder('$$', ignore=never)(data)),
                      [['>abc', '', '1', '$$'], ['>def', '#ignore', '2', '$$']])
     # ignore_labels additionally drops '#'-prefixed comment lines.
     self.assertEqual(
         list(DelimitedRecordFinder('$$', ignore=ignore_labels)(data)),
         [['>abc', '1', '$$'], ['>def', '2', '$$']])
コード例 #6
0
File: rfam.py  Project: wangdi2014/for_qiime_scripts
def is_empty_or_html(line):
    """Return True for HTML line and empty (or whitespace only) line.

    line -- string

    The Rfam adaptor that retrieves records includes two HTML tags in
    the record. These lines need to be ignored in addition to empty lines.
    """
    # startswith accepts a tuple, so both tags are checked in one call.
    if line.startswith(('<pre', '</pre')):
        return True
    return (not line) or line.isspace()


# Default sequence constructor: byte-based sequences from the BYTES MolType.
Sequence = BYTES.Sequence
# Rfam records end with a '//' line; blank lines and the HTML wrapper tags
# emitted by the Rfam adaptor are skipped (see is_empty_or_html above).
RfamFinder = DelimitedRecordFinder('//', ignore=is_empty_or_html)


def load_from_clustal(data, seq_constructor=Sequence, strict=True):
    """Build an Alignment (equal lengths) or SequenceCollection from clustal data."""
    records = []
    for name, seq in ClustalParser(data, strict):
        records.append((name, seq_constructor(seq)))
    seq_lengths = [len(pair[1]) for pair in records]
    # Equal-length sequences form a proper alignment; anything else falls
    # back to an unaligned collection.
    if seq_lengths and max(seq_lengths) == min(seq_lengths):
        return Alignment(records, MolType=BYTES)
    return SequenceCollection(records, MolType=BYTES)


#all fields concerning the references are translated to None, except for
# the MedLine ID, so that we can lookup the information if needed.
#RC = Reference comment
コード例 #7
0
# Identity translation table; with Py2 string.maketrans('','') this covers
# the full byte range — presumably used for character deletion elsewhere.
all_chars = maketrans('', '')
# DNA complement table: note 'u' is complemented as 't' would be (-> 'a').
dna_lc = 'utacgrywsmkbdhvn'
dna_lc_cmp = 'aatgcyrwskmvhdbn'
dna_trans = maketrans(dna_lc + dna_lc.upper(), dna_lc_cmp + dna_lc_cmp.upper())
# RNA complement table: 't' is complemented as 'u' would be (-> 'a'),
# and 'a' maps to 'u' rather than 't'.
rna_lc = 'utacgrywsmkbdhvn'
rna_lc_cmp = 'aaugcyrwskmvhdbn'
rna_trans = maketrans(rna_lc + rna_lc.upper(), rna_lc_cmp + rna_lc_cmp.upper())

# Positional fields of a GenBank LOCUS line; None entries are ignored.
locus_fields = [
    None, 'locus', 'length', None, 'mol_type', 'topology', 'db', 'date'
]
_locus_parser = FieldWrapper(locus_fields)

#need to turn off line stripping, because whitespace is significant:
# rstrip removes only trailing whitespace from each record line.
GbFinder = DelimitedRecordFinder('//', constructor=rstrip)


class PartialRecordError(Exception):
    """Raised when a record is incomplete or cannot be fully parsed."""


def parse_locus(line):
    """Parses a locus line, including conversion of Length to an int.
    
    WARNING: Gives incorrect results on legacy records that omit the topology. 
    All records spot-checked on 8/30/05 had been updated to include the topology
    even when prior versions omitted it.
    """
    result = _locus_parser(line)
    try:
コード例 #8
0
 def test_parsers_empty(self):
     """DelimitedRecordFinder should return empty list on empty lines"""
     finder = DelimitedRecordFinder('//')
     # Whitespace-only input and truly empty input both yield no records.
     self.assertEqual(list(finder(['  ', '\n'])), [])
     self.assertEqual(list(finder([])), [])
コード例 #9
0
 def test_parsers_strip(self):
     """DelimitedRecordFinder should trim each line correctly"""
     raw = '  \t   abc  \n \t   def\n  // \t\n\t\t efg \n//'.split('\n')
     expected = [['abc', 'def', '//'], ['efg', '//']]
     self.assertEqual(list(DelimitedRecordFinder('//')(raw)), expected)
コード例 #10
0
                    for ix, best_hit in enumerate(best_hits):
                        new_val = cast_fun(hit[field])
                        old_val = cast_fun(best_hit[field])
                        if cmp_fun(new_val, old_val):
                            best_hits[ix] = hit
                            continue
            yield q, best_hits

    def filterByIteration(self, iteration=-1):
        """Returns copy of self containing only specified iteration.

        Negative indices count backwards."""

    #raise error if both field and f passed, uses same dict as filterByField

# fastacmd -T records are separated by an empty line (delimiter='');
# never_ignore presumably keeps every line in the record — TODO confirm.
fastacmd_taxonomy_splitter = DelimitedRecordFinder(delimiter='', \
    ignore=never_ignore)
# Maps the labels printed by fastacmd -T to short attribute names.
fasta_field_map = {
    'NCBI sequence id': 'seq_id',
    'NCBI taxonomy id': 'tax_id',
    'Common name': 'common_name',
    'Scientific name': 'scientific_name'
}


def FastacmdTaxonomyParser(lines):
    """Yields successive records from the results of fastacmd -T.

    Format is four lines separated by newline:
    NCBI sequence
    NCBI taxonomy
    Common name
コード例 #11
0
File: ncbi.py  Project: mikerobeson/pycogent
    return ELinkResultParser(link.read())

def get_between_tags(line):
    """Returns portion of line between xml tags."""
    # Split on the first '>' from the left and the last '<' from the right,
    # so any angle brackets inside the content survive.
    return line.split('>', 1)[1].rsplit('<', 1)[0]

def taxon_lineage_extractor(lines):
    """Extracts lineage from taxonomy record lines, not incl. species."""
    for curr in lines:
        if '<Lineage>' not in curr:
            continue
        # Line looks like <Lineage>a; b; c</Lineage>: take the text between
        # the tags, split on ';', and strip each component.
        inner = curr.split('>', 1)[1].rsplit('<', 1)[0]
        yield map(strip, inner.split(';'))

# One record per closing </Taxon> tag; constructor=None leaves lines
# unmodified, and strict=False avoids RecordError on trailing data.
taxon_record_finder = DelimitedRecordFinder('</Taxon>', constructor=None, 
    strict=False)

def get_taxid_name_lineage(rec):
    """Returns taxon id, name, and lineage from single xml taxon record."""
    # The two-space prefix presumably restricts matches to the top-level
    # Taxon element, not nested sub-records — TODO confirm XML layout.
    taxid = None
    name = None
    lineage = None
    for curr in rec:
        if curr.startswith('  <TaxId>'):
            taxid = get_between_tags(curr)
        elif curr.startswith('  <ScientificName>'):
            name = get_between_tags(curr)
        elif curr.startswith('  <Lineage>'):
            lineage = map(strip, get_between_tags(curr).split(';'))
    return taxid, name, lineage
コード例 #12
0
from cogent.parse.record_finder import DelimitedRecordFinder
from cogent.parse.record import RecordError
from cogent.core.sequence import Sequence, RnaSequence
from cogent.core.info import Info
from cogent.core.alphabet import AlphabetError

__author__ = "Sandra Smit"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Sandra Smit", "Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.3-dev"
__maintainer__ = "Sandra Smit"
__email__ = "*****@*****.**"
__status__ = "Development"

# RDB records are terminated by a '//' line.
RdbFinder = DelimitedRecordFinder('//')

_field_names = {'acc':'rRNA',\
                'src':'Source',\
                'str':'Strain',\
                'ta1':'Taxonomy1',\
                'ta2':'Taxonomy2',\
                'ta3':'Taxonomy3',\
                'ta4':'Taxonomy4',\
                'chg':'Changes',\
                'rem':'Remarks',\
                'aut':'Authors',\
                'ttl':'Title',\
                'jou':'Journal',\
                'dat':'JournalYear',\
                'vol':'JournalVolume',\