def getLastScrappedDocId():
    """Return the id stored under the ``lastFactorizedDocId`` setting.

    When no row with that key exists, one is added with the value ``u'0'``,
    the transaction is committed, and 0 is returned.

    Returns:
        The value of the first matching row converted with ``int``, or 0
        when the setting had to be created.
    """
    settingKey = u'lastFactorizedDocId'
    rows = DBSession.query(Setting.value).filter(Setting.key == settingKey).all()
    if rows:
        # Only the ``value`` column is selected, so each row has one element.
        return int(rows[0][0])

    # First run: seed the setting so later updates have a row to modify.
    DBSession.add(Setting(key=settingKey, value=u'0'))
    transaction.commit()
    return 0
def getLastScrappedDocId():
    """Return the id stored under the ``lastFactorizedDocId`` setting.

    When no row with that key exists, one is added with the value ``u'0'``,
    the transaction is committed, and 0 is returned.

    Returns:
        The value of the first matching row converted with ``int``, or 0
        when the setting had to be created.
    """
    settingKey = u'lastFactorizedDocId'
    storedValues = DBSession.query(Setting.value).filter(
        Setting.key == settingKey).all()
    if storedValues:
        # Only the ``value`` column is selected, so each row has one element.
        firstRow = storedValues[0]
        return int(firstRow[0])

    # No such setting yet: create it with a zero value and persist it.
    DBSession.add(Setting(key=settingKey, value=u'0'))
    transaction.commit()
    return 0
def parse():
    """Extract and save facts for every scraped document not yet processed.

    Reads the last processed document id via ``getLastScrappedDocId()`` and
    walks the ``ScrapedData`` rows whose ``pk`` is greater, in ascending
    ``pk`` order. For each row:

    * facts are extracted from the preview text and from the title and
      merged into one collection;
    * the ``data`` column is decoded and, when it has a ``'salary'`` entry
      that ``extractSalary`` can parse, ``salary_from`` (and ``salary_to``
      for a range) facts are built;
    * everything is passed to ``saveFactsToDatabase`` together with the
      row's ``pk``.
    """
    # NOTE(review): amounts given in USD are multiplied by 8 -- presumably a
    # fixed exchange rate into the site's base currency; confirm.
    usdMultiplier = 8
    salaryFactNames = ('salary_from', 'salary_to')

    def toAmount(text):
        # Remove all (Unicode-aware) whitespace first so that digit groups
        # separated by spaces still convert.
        return int(re.sub(r'\s*', '', text, flags=re.U))

    def isUsd(currency):
        currency = currency.strip()
        return currency == '$' or currency.lower() == 'usd'

    lastId = getLastScrappedDocId()
    factStorage = createFactStorage()
    backFacts = backFactLinksProcessing()

    newDocs = DBSession.query(
        ScrapedData.pk, ScrapedData.url, ScrapedData.preview,
        ScrapedData.title, ScrapedData.data
    ).filter(ScrapedData.pk > lastId).order_by(ScrapedData.pk)

    for newLastId, _url, rawText, title, data in newDocs:
        facts = extractAll(rawText, factStorage, backFacts)
        facts.update(extractAll(title, factStorage, backFacts))

        # NOTE(review): ``decode`` is not part of the standard-library
        # ``json`` module; presumably ``json`` is bound to another library
        # in this file -- confirm.
        data = json.decode(data)

        salaryFact = []
        if 'salary' in data:
            salary = extractSalary(data['salary'])
            # Two shapes are handled: (amount, currency) and
            # (from, to, currency). Anything else yields no salary facts.
            if salary is not None and len(salary) in (2, 3):
                parts = list(salary)
                rawAmounts, currency = parts[:-1], parts[-1]
                try:
                    amounts = [toAmount(raw) for raw in rawAmounts]
                except ValueError:
                    # One unparsable amount zeroes the whole salary.
                    amounts = [0] * len(rawAmounts)
                if isUsd(currency):
                    amounts = [amount * usdMultiplier for amount in amounts]
                salaryFact.extend(zip(salaryFactNames, amounts))

        saveFactsToDatabase(newLastId, facts, salaryFact)
def parse():
    """Extract and save facts for every scraped document not yet processed.

    Reads the last processed document id via ``getLastScrappedDocId()`` and
    walks the ``ScrapedData`` rows whose ``pk`` is greater, in ascending
    ``pk`` order. For each row:

    * facts are extracted from the preview text and from the title and
      merged into one collection;
    * the ``data`` column is decoded and, when it has a ``'salary'`` entry
      that ``extractSalary`` can parse, ``salary_from`` (and ``salary_to``
      for a range) facts are built;
    * everything is passed to ``saveFactsToDatabase`` together with the
      row's ``pk``.
    """
    # NOTE(review): amounts given in USD are multiplied by 8 -- presumably a
    # fixed exchange rate into the site's base currency; confirm.
    usdMultiplier = 8
    salaryFactNames = ('salary_from', 'salary_to')

    def toAmount(text):
        # Remove all (Unicode-aware) whitespace first so that digit groups
        # separated by spaces still convert.
        return int(re.sub(r'\s*', '', text, flags=re.U))

    def isUsd(currency):
        currency = currency.strip()
        return currency == '$' or currency.lower() == 'usd'

    lastId = getLastScrappedDocId()
    factStorage = createFactStorage()
    backFacts = backFactLinksProcessing()

    newDocs = DBSession.query(
        ScrapedData.pk, ScrapedData.url, ScrapedData.preview,
        ScrapedData.title, ScrapedData.data
    ).filter(ScrapedData.pk > lastId).order_by(ScrapedData.pk)

    for newLastId, _url, rawText, title, data in newDocs:
        facts = extractAll(rawText, factStorage, backFacts)
        facts.update(extractAll(title, factStorage, backFacts))

        # NOTE(review): ``decode`` is not part of the standard-library
        # ``json`` module; presumably ``json`` is bound to another library
        # in this file -- confirm.
        data = json.decode(data)

        salaryFact = []
        if 'salary' in data:
            salary = extractSalary(data['salary'])
            # Two shapes are handled: (amount, currency) and
            # (from, to, currency). Anything else yields no salary facts.
            if salary is not None and len(salary) in (2, 3):
                parts = list(salary)
                rawAmounts, currency = parts[:-1], parts[-1]
                try:
                    amounts = [toAmount(raw) for raw in rawAmounts]
                except ValueError:
                    # One unparsable amount zeroes the whole salary.
                    amounts = [0] * len(rawAmounts)
                if isUsd(currency):
                    amounts = [amount * usdMultiplier for amount in amounts]
                salaryFact.extend(zip(salaryFactNames, amounts))

        saveFactsToDatabase(newLastId, facts, salaryFact)
def putLastScrappedDocId(lastId):
    """Store ``lastId`` as the value of the ``lastFactorizedDocId`` setting.

    Args:
        lastId: Id to record; it is converted with ``unicode`` before being
            written.

    The change is committed immediately with ``transaction.commit()``.
    """
    settingRows = DBSession.query(Setting.value).filter(
        Setting.key == u'lastFactorizedDocId')
    newValues = {u'value': unicode(lastId)}
    settingRows.update(newValues)
    transaction.commit()
def putLastScrappedDocId(lastId):
    """Store ``lastId`` as the value of the ``lastFactorizedDocId`` setting.

    Args:
        lastId: Id to record; it is converted with ``unicode`` before being
            written.

    The change is committed immediately with ``transaction.commit()``.
    """
    matchingSetting = DBSession.query(Setting.value).filter(
        Setting.key == u'lastFactorizedDocId')
    # The value is written as text, hence the ``unicode`` conversion.
    matchingSetting.update({u'value': unicode(lastId)})
    transaction.commit()
def loadFactLinksFromDatabase():
    """Build the ``FactLinks`` query filtered by ``Fact.type == FACT_TYPE_BOOL``.

    Returns:
        The filtered query object as-is; ``.all()`` is not called here.

    NOTE(review): the filter refers to ``Fact`` while only ``FactLinks`` is
    selected and no join is visible in this function -- confirm the two are
    linked the way this filter expects.
    """
    factLinks = DBSession.query(FactLinks)
    onlyBoolFacts = Fact.type == FACT_TYPE_BOOL
    return factLinks.filter(onlyBoolFacts)
def loadFactsFromDatabase():
    """Return the result of ``.all()`` on an unfiltered ``Fact`` query."""
    factQuery = DBSession.query(Fact)
    allFacts = factQuery.all()
    return allFacts
def loadFactLinksFromDatabase():
    """Build the ``FactLinks`` query filtered by ``Fact.type == FACT_TYPE_BOOL``.

    Returns:
        The filtered query object as-is; ``.all()`` is not called here.

    NOTE(review): the filter refers to ``Fact`` while only ``FactLinks`` is
    selected and no join is visible in this function -- confirm the two are
    linked the way this filter expects.
    """
    linkQuery = DBSession.query(FactLinks)
    boolTypeOnly = Fact.type == FACT_TYPE_BOOL
    filteredLinks = linkQuery.filter(boolTypeOnly)
    return filteredLinks
def loadFactsFromDatabase():
    """Return the result of ``.all()`` on an unfiltered ``Fact`` query."""
    everyFact = DBSession.query(Fact)
    return everyFact.all()