Code Example #1
File: get.py  Project: ko9ma7/newsCrawl
def crawlNews(oid, processNo, pushedNo, startTime):
    # Retry loop: on any failure, reconnect and resume from the checkpoint.
    while True:
        try:
            _, _, newsRawDB = connectDB(host)
            metadataCollection = newsRawDB['metadata']
            try:
                # Resume from the last article id checkpointed for this oid.
                startNo = metadataCollection.find_one({"oid": oid})['last']
            except (TypeError, KeyError):
                # No checkpoint document yet: start from the first article.
                startNo = 1
            tmpDB = []
            cnt = 0
            pushedNo.value += startNo - 1
            log('Process oid=%03d started at aid=%d' % (oid, startNo),
                startTime, processNo, pushedNo.value)
            for i in range(startNo, 999999999):
                status, newsResponseText, summary = getRaw(oid, i)
                if not status:
                    continue
                tmpDB.append({
                    'body': newsResponseText,
                    'summary': summary,
                    'aid': i
                })
                cnt += 1
                # Flush a full chunk of raw articles to MongoDB.
                if cnt >= chunk:
                    if len(tmpDB) > 0:
                        newsRawDB[str(oid)].insert_many(tmpDB)
                        pushedNo.value += len(tmpDB)
                    log(
                        'Pushed %03d objects to DB at oid=%03d for aid=%d' %
                        (len(tmpDB), oid, i), startTime, processNo,
                        pushedNo.value)
                    tmpDB = []
                    cnt = 0
                    try:
                        # Overwrite the checkpoint so a restart resumes here.
                        metadataCollection.delete_one({"oid": oid})
                        metadataCollection.insert_one({"oid": oid, "last": i})
                    except Exception:
                        pass
        except Exception:
            # Swallow the error; the outer loop reconnects and retries.
            pass
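
The snippets on this page all depend on a connectDB(host) helper that is not shown. From its call sites (it is unpacked as three values, e.g. newsDB, categoryDB, newsRawDB = connectDB(host) in process.py), it appears to return three database handles. A minimal sketch with pymongo, assuming the database names news, category, and newsRaw (the names are guesses; only the return order is visible here):

from pymongo import MongoClient


def connectDB(host):
    # Hypothetical reconstruction: only the three-value return order is
    # visible in the snippets; the database names are assumptions.
    client = MongoClient(host)  # e.g. 'mongodb://localhost:27017'
    return client['news'], client['category'], client['newsRaw']
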
Code Example #2
File: main.py  Project: ko9ma7/newsCrawl
    return {
        'newsId': aid,
        'title': newsTitle,
        'body': newsText,
        'summary': summaryText,
        'category': category,
        'publishTime': publishTime,
        'editedTime': editedTime
    }


import multiprocessing
from functools import partial
from operator import is_not

from tqdm import tqdm

if __name__ == '__main__':
    multiprocessing.freeze_support()
    oid = int(input())

    newsDB, categoryDB, __ = connectDB(host)
    metadataCollection = newsDB['metadata']
    try:
        # Resume from the last article id checkpointed for this oid.
        i = metadataCollection.find_one({"oid": oid})['last']
    except (TypeError, KeyError):
        i = 1
    while True:
        with multiprocessing.Pool(processes=processNo) as pool:
            # Crawl one batch of article ids in parallel; getNews returns
            # None for missing articles, and the filter drops those entries.
            newsList = list(
                filter(
                    partial(is_not, None),
                    tqdm(pool.imap_unordered(
                        getNews,
                        [(oid, x) for x in range(i, i + processNo * batch)]),
                         total=processNo * batch,
                         desc="Batch %d - %d" %
                         (i, i + processNo * batch))))
            # The source snippet is truncated here; the desc arguments and
            # closing parentheses above are assumed completions.
Code Example #3
import demjson
import numpy as np
from bs4 import BeautifulSoup


def crawlNews(page):
    # The top of this function, which fetches `status` and `html` for the
    # given page, is missing from the source snippet.
    news = []
    if not status:
        # The original returned the 1-tuple `False,`; return an empty list
        # instead, since the caller below calls len() and insert_many on it.
        return []
    newsSoup = BeautifulSoup(html, 'html.parser')
    els = newsSoup.select('div.fcItem_top.clearfix')
    for i in els:
        try:
            body = i.select('a')[0].text
            # Slice what looks like a JavaScript wrapper off the embedded
            # payload, then decode the remaining JSON leniently with demjson.
            factJson = demjson.decode(
                i.select('script')[0].text.strip()[14:-2].strip())
            # Keep only items with a positive mean score.
            score = np.mean(list(factJson['score'].values()))
            if score > 0:
                news.append({'body': body, 'score': score})
        except Exception:
            pass

    return news


newsDB, *_ = connectDB(host)
page = 1
tot = 0
while True:
    # Crawl page after page, pushing each non-empty batch to the `snu`
    # collection; the loop runs until interrupted.
    li = crawlNews(page)
    if len(li) > 0:
        newsDB['snu'].insert_many(li)
    tot += len(li)
    print('Pushed %d objects' % tot)
    page += 1
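
This snippet never shows how status and html are produced. A minimal sketch of such a fetch step using requests, purely illustrative (fetchPage and its return convention are hypothetical, not the project's actual helper):

import requests


def fetchPage(url):
    # Hypothetical fetch helper returning (status, html) like the snippet
    # above expects; the project's real implementation is not shown.
    try:
        r = requests.get(url, timeout=10)
        return r.status_code == 200, r.text
    except requests.RequestException:
        return False, ""
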
Code Example #4
File: process.py  Project: ko9ma7/newsCrawl
from bs4 import BeautifulSoup, Comment, NavigableString


def parseNews(oid, processNo, parsedNo, startTime):
    # Retry loop: on any failure, reconnect and start a fresh pass.
    while True:
        try:
            log('Process oid=%03d started.' % oid, 0, 0, 0)
            newsDB, categoryDB, newsRawDB = connectDB(host)
            while True:
                li = list(newsRawDB[str(oid)].find().limit(chunk))
                if len(li) == 0:
                    return
                log('Got %d Data from DB at oid=%03d' % (len(li), oid),
                    startTime, processNo, parsedNo.value)
                removeLi = []
                processedNews = []
                categoryDict = dict()
                for news in li:
                    try:
                        removeLi.append({'_id': news['_id']})
                        aid, body, summary = news['aid'], news['body'], news[
                            'summary']
                        summarySoup = BeautifulSoup(summary['summary'],
                                                    'html.parser')
                        summaryText = summarySoup.get_text()
                        newsText = ""
                        newsSoup = BeautifulSoup(body, 'html.parser')
                        bodyEl = newsSoup.find(id="articleBodyContents")
                        # Walk the direct children of the article body:
                        # keep bare text, drop HTML comments, turn <br>
                        # into newlines, and keep the text of elements
                        # marked data-type="ore".
                        for i in bodyEl:
                            if type(i) is NavigableString:
                                newsText += i
                            elif type(i) is Comment:
                                pass
                            else:
                                if i.name == 'br':
                                    newsText += '\n'
                                if i.get('data-type') == 'ore':
                                    newsText += i.get_text()

                        newsText = newsText.replace('\n\n', '\n')
                        newsText = newsText.replace('\n', ' ')
                        newsText = newsText.replace('  ', ' ')
                        # str.decode() does not exist on Python 3 strings;
                        # a plain strip is all that is needed here.
                        newsText = newsText.strip()

                        newsTitle = newsSoup.find(
                            id="articleTitle").get_text().strip()

                        # Record this article's sections and group
                        # (oid, aid) pairs per section for the category DB.
                        category = []
                        for i in newsSoup.find_all(
                                "em", {"class": "guide_categorization_item"}):
                            section = sectionName[i.get_text()]
                            category.append(section)
                            categoryDict.setdefault(section, []).append(
                                {'oid': oid, 'aid': aid})

                        # The first span.t11 holds the publish time; a
                        # second one, when present, holds the edited time.
                        times = newsSoup.find_all("span", {"class": "t11"})
                        publishTime = strToDate(times[0].get_text())
                        if len(times) == 2:
                            editedTime = strToDate(times[1].get_text())
                        else:
                            editedTime = publishTime

                        processedNews.append({
                            'newsId': aid,
                            'title': newsTitle,
                            'body': newsText,
                            'summary': summaryText,
                            'category': category,
                            'publishTime': publishTime,
                            'editedTime': editedTime
                        })
                    except Exception:
                        # Skip articles that fail to parse.
                        pass
                for section, data in categoryDict.items():
                    categoryDB[section].insert_many(data)
                if len(processedNews) > 0:
                    newsDB[str(oid)].insert_many(processedNews)
                    parsedNo.value += len(processedNews)
                log(
                    'Parsed %03d objects in DB at oid=%03d' %
                    (len(processedNews), oid), startTime, processNo,
                    parsedNo.value)
                for remove in removeLi:
                    newsRawDB[str(oid)].delete_one(remove)
                log('Dropped %03d objects in RAW at oid=%03d' %
                    (len(removeLi), oid), startTime, processNo,
                    parsedNo.value)
        except Exception:
            # Reconnect and retry on any failure.
            pass
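
Every file here also calls a shared log(message, startTime, processNo, count) helper that is not included. A plausible sketch, assuming startTime is a time.time() timestamp and count is the running object counter (the whole implementation is inferred from the call sites, not taken from the project):

import time


def log(message, startTime, processNo, count):
    # Hypothetical reconstruction inferred from call sites only: print
    # elapsed time, process number, the running counter, and throughput.
    elapsed = time.time() - startTime
    rate = count / elapsed if elapsed > 0 else 0.0
    print('[%9.1fs][P%02d][n=%d, %.1f/s] %s' %
          (elapsed, processNo, count, rate, message))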