Beispiel #1
0
def getLastScrappedDocId():
    """Return the stored 'lastFactorizedDocId' setting as an int.

    When no row with that key exists, one is added with the value u'0',
    the transaction is committed, and 0 is returned.
    """
    settingKey = u'lastFactorizedDocId'
    rows = DBSession.query(Setting.value).filter(Setting.key == settingKey).all()
    if rows:
        # Each result row holds a single column: the setting's value.
        return int(rows[0][0])

    # No row yet: create it with the initial value and commit.
    DBSession.add(Setting(key=settingKey, value=u'0'))
    transaction.commit()
    return 0
Beispiel #2
0
def getLastScrappedDocId():
    """Read the 'lastFactorizedDocId' setting and return it as an int.

    If the setting is missing, a row with value u'0' is added and
    committed, and 0 is returned.
    """
    valueQuery = DBSession.query(Setting.value)
    found = valueQuery.filter(Setting.key == u'lastFactorizedDocId').all()
    if not found:
        # Nothing stored so far: insert the initial value.
        DBSession.add(Setting(key=u'lastFactorizedDocId', value=u'0'))
        transaction.commit()
        return 0
    # First row, first (and only) selected column.
    firstRow = found[0]
    return int(firstRow[0])
Beispiel #3
0
# NOTE(review): 8 looks like a hard-coded USD conversion factor - confirm.
_USD_MULTIPLIER = 8


def _parseSalaryAmount(text):
    """Return *text* as an int after removing all whitespace.

    Raises ValueError when the remaining text is not an integer.
    """
    return int(re.sub(r'\s+', '', text, flags=re.U))


def _isUsd(currency):
    """Tell whether *currency* is '$' or 'usd' (case-insensitive)."""
    currency = currency.strip()
    return currency == '$' or currency.lower() == 'usd'


def _buildSalaryFacts(salary):
    """Turn the value returned by extractSalary into (name, amount) facts.

    A 2-item *salary* is read as (amount, currency) and yields one
    'salary_from' fact; a 3-item one is read as (from, to, currency) and
    yields 'salary_from' and 'salary_to'. None, or any other length,
    yields an empty list.
    """
    if salary is None:
        return []
    if len(salary) == 2:
        names = ('salary_from',)
    elif len(salary) == 3:
        names = ('salary_from', 'salary_to')
    else:
        return []

    try:
        amounts = [_parseSalaryAmount(salary[i]) for i in range(len(names))]
    except ValueError:
        # If any amount is unparsable, every amount falls back to 0.
        amounts = [0] * len(names)

    # The currency marker is the item right after the amounts.
    if _isUsd(salary[len(names)]):
        amounts = [amount * _USD_MULTIPLIER for amount in amounts]
    return list(zip(names, amounts))


def parse():
    """Extract and save facts for scraped documents newer than the stored id.

    Rows of ScrapedData with pk greater than getLastScrappedDocId() are
    visited in pk order. For each one, facts are extracted from the preview
    text and the title, salary facts are derived from the decoded ``data``
    field when it has a 'salary' entry, and everything is handed to
    saveFactsToDatabase.
    """
    lastId = getLastScrappedDocId()
    factStorage = createFactStorage()
    backFacts = backFactLinksProcessing()

    # NOTE(review): the last processed id is not written back in this
    # function; presumably that happens elsewhere - confirm.
    rows = DBSession.query(
        ScrapedData.pk, ScrapedData.url, ScrapedData.preview,
        ScrapedData.title, ScrapedData.data
    ).filter(ScrapedData.pk > lastId).order_by(ScrapedData.pk)

    for docId, _url, rawText, title, data in rows:
        facts = extractAll(rawText, factStorage, backFacts)
        facts.update(extractAll(title, factStorage, backFacts))

        # NOTE(review): the standard-library json module has no `decode`;
        # presumably `json` is bound to another library here - confirm.
        data = json.decode(data)
        salaryFact = []
        if 'salary' in data:
            salaryFact = _buildSalaryFacts(extractSalary(data['salary']))

        saveFactsToDatabase(docId, facts, salaryFact)
Beispiel #4
0
# NOTE(review): 8 looks like a hard-coded USD conversion factor - confirm.
_USD_MULTIPLIER = 8


def _parseSalaryAmount(text):
    """Return *text* as an int after removing all whitespace.

    Raises ValueError when the remaining text is not an integer.
    """
    return int(re.sub(r'\s+', '', text, flags=re.U))


def _isUsd(currency):
    """Tell whether *currency* is '$' or 'usd' (case-insensitive)."""
    currency = currency.strip()
    return currency == '$' or currency.lower() == 'usd'


def _buildSalaryFacts(salary):
    """Turn the value returned by extractSalary into (name, amount) facts.

    A 2-item *salary* is read as (amount, currency) and yields one
    'salary_from' fact; a 3-item one is read as (from, to, currency) and
    yields 'salary_from' and 'salary_to'. None, or any other length,
    yields an empty list.
    """
    if salary is None:
        return []
    if len(salary) == 2:
        names = ('salary_from',)
    elif len(salary) == 3:
        names = ('salary_from', 'salary_to')
    else:
        return []

    try:
        amounts = [_parseSalaryAmount(salary[i]) for i in range(len(names))]
    except ValueError:
        # If any amount is unparsable, every amount falls back to 0.
        amounts = [0] * len(names)

    # The currency marker is the item right after the amounts.
    if _isUsd(salary[len(names)]):
        amounts = [amount * _USD_MULTIPLIER for amount in amounts]
    return list(zip(names, amounts))


def parse():
    """Extract and save facts for scraped documents newer than the stored id.

    Rows of ScrapedData with pk greater than getLastScrappedDocId() are
    visited in pk order. For each one, facts are extracted from the preview
    text and the title, salary facts are derived from the decoded ``data``
    field when it has a 'salary' entry, and everything is handed to
    saveFactsToDatabase.
    """
    lastId = getLastScrappedDocId()
    factStorage = createFactStorage()
    backFacts = backFactLinksProcessing()

    # NOTE(review): the last processed id is not written back in this
    # function; presumably that happens elsewhere - confirm.
    rows = DBSession.query(
        ScrapedData.pk, ScrapedData.url, ScrapedData.preview,
        ScrapedData.title, ScrapedData.data
    ).filter(ScrapedData.pk > lastId).order_by(ScrapedData.pk)

    for docId, _url, rawText, title, data in rows:
        facts = extractAll(rawText, factStorage, backFacts)
        facts.update(extractAll(title, factStorage, backFacts))

        # NOTE(review): the standard-library json module has no `decode`;
        # presumably `json` is bound to another library here - confirm.
        data = json.decode(data)
        salaryFact = []
        if 'salary' in data:
            salaryFact = _buildSalaryFacts(extractSalary(data['salary']))

        saveFactsToDatabase(docId, facts, salaryFact)
Beispiel #5
0
def putLastScrappedDocId(lastId):
    """Write *lastId* into the 'lastFactorizedDocId' setting and commit.

    The id is converted with ``unicode`` before being stored. Only an
    update is issued here; no row is inserted by this function.
    """
    matching = DBSession.query(Setting.value).filter(
        Setting.key == u'lastFactorizedDocId')
    newValues = {u'value': unicode(lastId)}
    matching.update(newValues)
    transaction.commit()
Beispiel #6
0
def putLastScrappedDocId(lastId):
    """Update the 'lastFactorizedDocId' setting to *lastId* and commit.

    The value is stored as ``unicode(lastId)``. This issues an update
    only; it does not add a row.
    """
    settingRows = DBSession.query(Setting.value)
    settingRows = settingRows.filter(Setting.key == u'lastFactorizedDocId')
    storedValue = unicode(lastId)
    settingRows.update({u'value': storedValue})
    transaction.commit()
Beispiel #7
0
def loadFactLinksFromDatabase():
    """Return the FactLinks query filtered on Fact.type == FACT_TYPE_BOOL.

    The filtered query object is returned as built; ``.all()`` is not
    called here.
    """
    # NOTE(review): the condition is on Fact.type while the query selects
    # FactLinks, with no explicit join visible here - confirm this is intended.
    linksQuery = DBSession.query(FactLinks)
    typeCondition = Fact.type == FACT_TYPE_BOOL
    return linksQuery.filter(typeCondition)
Beispiel #8
0
def loadFactsFromDatabase():
    """Return the result of ``DBSession.query(Fact).all()``."""
    factQuery = DBSession.query(Fact)
    return factQuery.all()
Beispiel #9
0
def loadFactLinksFromDatabase():
    """Query FactLinks with the filter Fact.type == FACT_TYPE_BOOL.

    Returns whatever ``.filter(...)`` produces; ``.all()`` is not called
    in this function.
    """
    # NOTE(review): FactLinks is selected but the filter is on Fact.type and
    # no join is shown here - verify against the model definitions.
    query = DBSession.query(FactLinks)
    filtered = query.filter(Fact.type == FACT_TYPE_BOOL)
    return filtered
Beispiel #10
0
def loadFactsFromDatabase():
    """Run ``DBSession.query(Fact).all()`` and return its result."""
    allFacts = DBSession.query(Fact).all()
    return allFacts