def getTexts(self, id_list):
    '''
    Yield a Text object for every requested id that is also in ``self.idSet``.

    :param id_list: iterable of text ids; ids missing from ``self.idSet``
        are silently dropped.
    :return: generator of Text objects with ``title``, ``date``, ``url``,
        ``datesaved`` and ``datepublished`` set from the matching row
        (``date`` is filled from the ``datesaved`` column).
    '''
    self._initDB()
    # Restrict the request to ids this corpus knows about.
    ids = set(id_list)
    ids.intersection_update(self.idSet)
    if not ids:
        return
    id_str = '(' + ','.join(str(i) for i in ids) + ')'
    query = self.query_template % (
        'id, text, feedtitle, datesaved, datepublished, url', self.__idSet())
    query = query + (' AND id IN %s' % id_str)
    session = self._session
    try:
        # NOTE(review): SQLAlchemy documents this option as `stream_results`;
        # `stream_result` is presumably ignored - confirm before renaming.
        rows = session.query('id', 'text', 'feedtitle', 'datesaved',
                             'datepublished', 'url').\
            execution_options(stream_result=True).\
            from_statement(text(query)).all()
        for id_, body, title, datesav, datepub, url in rows:
            txt = Text(id_, body)
            txt.title = title
            txt.date = datesav
            txt.url = url
            txt.datesaved = datesav
            txt.datepublished = datepub
            yield txt
    finally:
        # Runs even when the consumer stops iterating early or the query
        # fails, so the session is always released.
        session.close()
def __iter__(self):
    '''Open a session to the database and iterate over article texts.

    :return: generator of Text objects built from the ``id``, ``text`` and
        ``feedtitle`` columns of every ``feedarticle`` row.
    '''
    self._initDB()
    session = self._session_maker()
    try:
        # NOTE(review): SQLAlchemy documents this option as `stream_results`;
        # `stream_result` is presumably ignored - confirm before renaming.
        rows = session.query("id", "text", "feedtitle").\
            execution_options(stream_result=True).\
            from_statement(text("SELECT id, text, feedtitle FROM feedarticle")).all()
        for id_, body, title in rows:
            yield Text(id_, body, title=title)
    finally:
        # Release the per-iteration session even if the consumer abandons
        # the generator or the query raises.
        session.close()
def getTexts(self, id_list):
    """Yield a Text object for each ``feedarticle`` row whose id is in id_list.

    :param id_list: iterable of ids; nothing is queried when it is empty.
    :return: generator of Text objects built from the ``id``, ``text`` and
        ``feedtitle`` columns.
    """
    self._initDB()
    # Materialise once so that iterators/generators are accepted too.
    ids = list(id_list)
    if not ids:
        return
    # NOTE(review): ids are interpolated into the SQL text via str();
    # this assumes they are trusted numeric ids - confirm against callers.
    id_str = "(" + ",".join(str(i) for i in ids) + ")"
    query = "SELECT id, text, feedtitle FROM feedarticle WHERE id IN %s" % id_str
    session = self._session
    # NOTE(review): SQLAlchemy documents this option as `stream_results`;
    # `stream_result` is presumably ignored - confirm before renaming.
    rows = session.query("id", "text", "feedtitle").\
        execution_options(stream_result=True).\
        from_statement(text(query)).all()
    for id_, body, title in rows:
        yield Text(id_, body, title=title)
def __iter__(self):
    """Open a session to the database and iterate over all article texts.

    :return: generator of Text objects with ``title``, ``date``, ``url``,
        ``datesaved`` and ``datepublished`` set from each row
        (``date`` is filled from the ``datesaved`` column).
    """
    self._initDB()
    session = self._session_maker()
    try:
        query = self.__queryTemplate(True) \
            % ('id, text, feedtitle, datesaved, datepublished, url', self.__idSet())
        # NOTE(review): SQLAlchemy documents this option as `stream_results`;
        # `stream_result` is presumably ignored - confirm before renaming.
        rows = session.query('id', 'text', 'feedtitle', 'datesaved',
                             'datepublished', 'url').\
            execution_options(stream_result=True).\
            from_statement(text(query)).all()
        for id_, body, title, datesav, datepub, url in rows:
            txt = Text(id_, body)
            txt.title = title
            txt.date = datesav
            txt.url = url
            txt.datesaved = datesav
            txt.datepublished = datepub
            yield txt
    finally:
        # Release the per-iteration session even if the consumer abandons
        # the generator or building/running the query raises.
        session.close()
def getTexts(self, id_list):
    '''Yield a Text object for each stored text whose id is in id_list.

    :param id_list: iterable of ids; nothing is queried when it is empty.
    :return: generator of Text objects carrying ``title``, ``date``, ``url``,
        ``datesaved`` and ``datepublished`` (``date`` is filled from the
        ``datesaved`` column).
    '''
    self._initDB()
    # Materialise once so that iterators/generators are accepted too.
    ids = list(id_list)
    if not ids:
        return
    # NOTE(review): ids are interpolated into the SQL text via str();
    # this assumes they are trusted numeric ids - confirm against callers.
    id_str = '(' + ','.join(str(i) for i in ids) + ')'
    query = self.__queryTemplate() \
        % ('id, text, feedtitle, datesaved, datepublished, url',
           self.__dateCondition(), self.__feedSet(), 'AND id IN %s' % id_str)
    session = self._session
    try:
        # NOTE(review): SQLAlchemy documents this option as `stream_results`;
        # `stream_result` is presumably ignored - confirm before renaming.
        rows = session.query('id', 'text', 'feedtitle', 'datesaved',
                             'datepublished', 'url').\
            execution_options(stream_result=True).\
            from_statement(text(query)).all()
        for id_, body, title, datesav, datepub, url in rows:
            yield Text(id_, body, title=title, date=datesav, url=url,
                       datesaved=datesav, datepublished=datepub)
    finally:
        # Runs even when the consumer stops iterating early or the query
        # fails, so the session is always released.
        session.close()
def parseLine(line):
    '''
    Parse a string line and return Text object.

    The 'text' property is mandatory and must be last: everything after
    its '=' up to the end of the line is taken verbatim as the text, so it
    may contain commas and '=' characters. Every other property value is
    terminated by a comma that is not followed by another comma; a
    backslash-escaped comma is consumed as part of the value. The raw value
    (without the terminating comma) is passed through processValue.

    :param line: property1=value1, property2=value2, ...
    :return: Text object with properties and their values matching parsed
        properties; the 'id' property becomes the Text id (None if absent).
    :raises ValueError: if a property has no comma-terminated value, or if
        no 'text' property is found.
    '''
    propertyRe = re.compile(r'\s*([a-zA-Z]\w*)\s*=')
    valueRe = re.compile(r'([^,]?(\\,)?)+,(?!,)')
    props = {}
    id_ = None
    pos = 0
    while pos < len(line):
        pmatch = propertyRe.match(line, pos)
        if pmatch is None:
            # Not positioned at "name =": stop and report the missing text.
            break
        name = pmatch.group(1)
        if name == 'text':
            return Text(id_, line[pmatch.end(0):], **props)
        vmatch = valueRe.match(line, pmatch.end(0))
        if vmatch is None:
            raise ValueError('property %s does not have value' % name)
        pos = vmatch.end(0)
        # Drop the terminating comma matched by valueRe.
        value = processValue(line[vmatch.start(0):pos - 1])
        if name == 'id':
            id_ = value
        else:
            props[name] = value
    raise ValueError('there must be a "text" property')
from pytopia.corpus.Text import Text from pytopia.corpus.text.TextPerLineCorpus import TextPerLineCorpus from pytopia.corpus.text.TextCorpus import TextCorpus #TODO move hard line syntax cases to parseLine tests #TODO and fix multiple commas case corpusData = { 'rawText': ur''' id = 0, att = abcd, text = some, text id = 1, att = a\,b\,c\,a , text = text=now, ''', 'texts': [ Text('0', u' some, text', att='abcd'), Text('1', u' text=now,', att=u'a,b,c,a') ] } def assertTextEquality(txt1, txt2): assert txt1.id == txt2.id assert txt1.text == txt2.text #TODO assert equaility for other attributes def singleCorpusCheck(cClass, params, texts): corpus = cClass(**params) for i, txto in enumerate(corpus): ctxt = texts[i] assertTextEquality(txto, ctxt)