コード例 #1
0
 def getTexts(self, id_list):
     '''
     Yield Text objects for the stored texts with the specified ids.

     Requested ids that are not in self.idSet are ignored, and nothing is
     yielded when no requested id remains. Every yielded Text gets title,
     date (the save date), url, datesaved and datepublished attributes
     copied from its database row.

     :param id_list: iterable of text ids
     '''
     self._initDB()
     # keep only the requested ids that this corpus knows about
     ids = set(id_list)
     ids.intersection_update(self.idSet)
     if not ids:
         return
     # the ids are interpolated verbatim into the SQL text below
     idStr = '(' + ','.join([str(i) for i in ids]) + ')'
     query = self.query_template % (
         'id, text, feedtitle, datesaved, datepublished, url',
         self.__idSet())
     query = query + (' AND id IN %s' % idStr)
     session = self._session
     try:
         # NOTE(review): SQLAlchemy documents this option as 'stream_results';
         # confirm the spelling. The rows are fully loaded by .all() anyway.
         rows = session.query('id', 'text', 'feedtitle', 'datesaved', 'datepublished', 'url').\
             execution_options(stream_result=True).from_statement(text(query)).all()
         for id_, body, title, datesav, datepub, url in rows:
             txt = Text(id_, body)
             txt.title = title
             txt.date = datesav
             txt.url = url
             txt.datesaved = datesav
             txt.datepublished = datepub
             yield txt
     finally:
         # close the session even if the consumer stops iterating early
         # or the query raises
         session.close()
コード例 #2
0
 def __iter__(self):
     '''
     Open a session to the database and iterate over article texts.

     Yields one Text, built from the id, text and feedtitle columns, for
     every row of the feedarticle table. The session is closed when the
     iteration finishes, fails, or is abandoned by the consumer.
     '''
     self._initDB()
     session = self._session_maker()
     try:
         # NOTE(review): SQLAlchemy documents this option as 'stream_results';
         # confirm the spelling. The rows are fully loaded by .all() anyway.
         rows = session.query("id", "text", "feedtitle").\
             execution_options(stream_result=True).\
             from_statement(text("SELECT id, text, feedtitle FROM feedarticle")).all()
         for id_, body, title in rows:
             yield Text(id_, body, title=title)
     finally:
         # without this, breaking out of the loop early would leak the session
         session.close()
コード例 #3
0
 def getTexts(self, id_list):
     '''
     Yield Text objects (with title) for the articles with the specified ids.

     :param id_list: iterable of article ids; nothing is yielded when it is empty
     '''
     self._initDB()
     # materialize first so that iterators and generators are accepted too
     ids = list(id_list)
     if not ids:
         return
     # the ids are interpolated verbatim into the SQL text below
     id_str = '(' + ','.join(str(i) for i in ids) + ')'
     query = "SELECT id, text, feedtitle FROM feedarticle WHERE id IN %s" % id_str
     # the shared session is used and deliberately left open, as before
     session = self._session
     rows = session.query("id", "text", "feedtitle").\
         execution_options(stream_result=True).from_statement(text(query)).all()
     for id_, body, title in rows:
         yield Text(id_, body, title=title)
コード例 #4
0
 def __iter__(self):
     '''
     Open a session to the database and iterate over all article texts.

     Every row returned by the templated query becomes a Text with title,
     date (the save date), url, datesaved and datepublished attributes.
     The session is closed when the iteration finishes, fails, or is
     abandoned by the consumer.
     '''
     self._initDB()
     session = self._session_maker()
     try:
         query = self.__queryTemplate(True) \
                 % ('id, text, feedtitle, datesaved, datepublished, url', self.__idSet())
         # NOTE(review): SQLAlchemy documents this option as 'stream_results';
         # confirm the spelling. The rows are fully loaded by .all() anyway.
         rows = session.query('id', 'text', 'feedtitle',
                              'datesaved', 'datepublished', 'url').\
             execution_options(stream_result=True).from_statement(text(query)).all()
         for id_, body, title, datesav, datepub, url in rows:
             txt = Text(id_, body)
             txt.title = title
             txt.date = datesav
             txt.url = url
             txt.datesaved = datesav
             txt.datepublished = datepub
             yield txt
     finally:
         # without this, breaking out of the loop early would leak the session
         session.close()
コード例 #5
0
 def getTexts(self, id_list):
     '''
     Yield Text objects for the texts with the specified ids.

     The SQL is built from the corpus query template together with its
     date condition and feed set, restricted to the given ids. Every
     yielded Text is created with title, date (the save date), url,
     datesaved and datepublished keyword arguments taken from its row.

     :param id_list: iterable of text ids; nothing is yielded when it is empty
     '''
     self._initDB()
     # materialize first so that iterators and generators are accepted too
     ids = list(id_list)
     if not ids:
         return
     # the ids are interpolated verbatim into the SQL text below
     id_str = '(' + ','.join([str(i) for i in ids]) + ')'
     query = self.__queryTemplate() \
             % ('id, text, feedtitle, datesaved, datepublished, url',
                self.__dateCondition(), self.__feedSet(), 'AND id IN %s' % id_str)
     session = self._session
     try:
         # NOTE(review): SQLAlchemy documents this option as 'stream_results';
         # confirm the spelling. The rows are fully loaded by .all() anyway.
         rows = session.query('id', 'text', 'feedtitle', 'datesaved', 'datepublished', 'url').\
             execution_options(stream_result=True).from_statement(text(query)).all()
         for id_, body, title, datesav, datepub, url in rows:
             yield Text(id_,
                        body,
                        title=title,
                        date=datesav,
                        url=url,
                        datesaved=datesav,
                        datepublished=datepub)
     finally:
         # close the session even if the consumer stops iterating early
         # or the query raises
         session.close()
コード例 #6
0
def parseLine(line):
    '''
    Build a Text object from a single "name=value, name=value, ..." line.

    Properties are read left to right. The 'text' property is mandatory and
    must come last: everything after its '=' becomes the text verbatim.
    The 'id' property becomes the Text id, all other properties are passed
    to Text as keyword arguments. Values are terminated by a comma and are
    run through processValue.

    :param line: property1=value1, property2=value2, ..., text=...
    :return: Text object whose properties match the parsed ones
    :raises Exception: if a property has no comma-terminated value, or if
        no 'text' property is found
    '''
    nameRe = re.compile(r'\s*([a-zA-Z]\w*)\s*=')
    commaValueRe = re.compile(r'([^,]?(\\,)?)+,(?!,)')
    textId = None
    attributes = {}
    cursor = 0
    while cursor < len(line):
        nameMatch = nameRe.match(line, cursor)
        if nameMatch is None:
            # no further "name =" prefix, stop scanning
            break
        cursor = nameMatch.end(0)
        name = nameMatch.group(1)
        if name == 'text':
            # the rest of the line is the text, commas included
            return Text(textId, line[cursor:], **attributes)
        valueMatch = commaValueRe.match(line, cursor)
        if valueMatch is None:
            raise Exception('property %s does not have value' % name)
        cursor = valueMatch.end(0)
        # drop the terminating comma before processing the value
        value = processValue(valueMatch.group(0)[:-1])
        if name == 'id':
            textId = value
        else:
            attributes[name] = value
    raise Exception('there must be a "text" property')
コード例 #7
0
from pytopia.corpus.Text import Text
from pytopia.corpus.text.TextPerLineCorpus import TextPerLineCorpus
from pytopia.corpus.text.TextCorpus import TextCorpus

#TODO move hard line syntax cases to parseLine tests
#TODO  and fix multiple commas case
# Fixture: raw corpus text (one text per line) and the Text objects
# expected from parsing it. The backslashes are doubled in a plain unicode
# literal because the 'ur' string prefix is a syntax error on Python 3;
# the resulting string value is the same as with the raw literal.
corpusData = {
    'rawText':
    u'''
    id = 0, att = abcd, text = some, text
    id = 1, att = a\\,b\\,c\\,a , text = text=now,
    ''',
    'texts': [
        Text('0', u' some, text', att='abcd'),
        Text('1', u' text=now,', att=u'a,b,c,a')
    ]
}


def assertTextEquality(txt1, txt2):
    '''
    Assert that two text objects have equal id and equal text.

    :raises AssertionError: on the first attribute that differs
    '''
    #TODO assert equality for other attributes
    for attribute in ('id', 'text'):
        assert getattr(txt1, attribute) == getattr(txt2, attribute)


def singleCorpusCheck(cClass, params, texts):
    '''
    Build a corpus and check that iterating it yields exactly the expected texts.

    :param cClass: corpus class (or factory) called as cClass(**params)
    :param params: keyword arguments for constructing the corpus
    :param texts: expected texts, in iteration order
    :raises AssertionError: if a text differs from the expected one at the
        same position, or if the corpus yields more or fewer texts than expected
    '''
    corpus = cClass(**params)
    seen = 0
    for txto in corpus:
        assert seen < len(texts), \
            'corpus yielded more than the %d expected texts' % len(texts)
        assertTextEquality(txto, texts[seen])
        seen += 1
    # a corpus that yields too few (or no) texts must not pass silently
    assert seen == len(texts), \
        'corpus yielded %d texts, expected %d' % (seen, len(texts))