Code example #1
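The imports are not shown in this listing; judging from the names used below, the top of the file presumably looks something like this (`database` is a project-local module, and `got` is a GetOldTweets-style scraper, e.g. `import got` from Jefferson-Henrique/GetOldTweets-python or `import GetOldTweets3 as got`):

import time

import got       # GetOldTweets scraper (assumption: alias matches the project's import)
import database  # project-local helper exposing news_collection()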
def getData():
    tweetCriteria = got.manager.TweetCriteria().setUsername("CNN").setSince("2013-01-01").setUntil(
        "2017-01-01").setMaxTweets(15000)
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)

    news_collection = database.news_collection()

    for tweet in tweets:
        urls = tweet.urls

        if 'twitter' in urls or not urls:
            continue

        if news_collection.find({'reference': urls}).count() > 0:
            print('Skip duplicated ' + urls)
            continue

        timestamp = int(time.mktime(time.strptime(tweet.formatted_date, '%a %b %d %H:%M:%S +0000 %Y')))
        document = {
            'id': tweet.id,
            'created_at': timestamp,
            'reference': tweet.urls
        }
        print('Insert ' + tweet.urls + '  created at ' + str(timestamp))
        news_collection.insert_one(document)
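Note that `Cursor.count()` (used for the duplicate check above) was deprecated in PyMongo 3.7 and removed in 4.0; on a current driver the same check could be written roughly as:

if news_collection.count_documents({'reference': urls}) > 0:
    print('Skip duplicated ' + urls)
    continue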
Code example #2
File: preprocess.py  Project: huyqut/thesis-spider
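As elsewhere in this listing, the imports are omitted; `build_tagged` appears to rely on roughly the following (the project-local modules are assumptions based on usage):

import nltk
from gensim.models.doc2vec import TaggedDocument

import database        # project-local MongoDB helpers
import thesis_logging  # project-local logging setup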
def build_tagged():
    logger = thesis_logging.get_logger('preprocess')
    latest = 0
    count = 0
    index = 1

    news_collection = database.news_collection()
    duplicated_doc = {}
    while True:
        # Use '$gt' rather than '$gte' so the newest document is not re-matched on every pass;
        # the loop then terminates once no newer documents exist.
        documents = news_collection.find({'created_at': {'$gt': latest}})
        if documents.count() == 0:
            break
        for doc in documents:
            count += 1
            try:
                latest = doc['created_at']
                if not doc.get('text'):
                    print('Ignore',
                          'Count ' + str(count), 'Id ' + str(doc['id']),
                          str(doc['created_at']), doc['reference'])
                    continue
                content = doc['text']
                if content not in duplicated_doc:
                    duplicated_doc[content] = True
                    index += 1
                    tokens = nltk.word_tokenize(content.lower())
                    logger.info(tokens)
                    yield TaggedDocument(words=tokens, tags=[index])

            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))
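Because `build_tagged` yields gensim `TaggedDocument` objects, it can feed a Doc2Vec model directly. A minimal training sketch (hyperparameters are illustrative, not taken from the project):

from gensim.models.doc2vec import Doc2Vec

corpus = list(build_tagged())  # materialise the generator so it can be iterated more than once
model = Doc2Vec(vector_size=300, min_count=2, epochs=10)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
model.save('model/doc2vec-model-temp-300.bin')  # path as used in code example #5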
Code example #3
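`write_web_news` references `ObjectId` and two project-local modules whose imports the listing omits; a plausible header would be:

from bson import ObjectId

import news_database   # project-local module exposing news_collection()
import thesis_logging  # project-local logging setup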
def write_web_news():
    logger = thesis_logging.get_logger('preprocess')

    link_list = [
        'cnn.it', 'nyti.ms', 'nbcnews', 'apne.ws', 'reut.rs', 'wapo.st',
        'abcn.ws', 'nbcbay.com', 'bbc.in', 'huff.to', 'ti.me', 'cbsn.ws',
        'huffingtonpost.com', 'cnb.cx', 'cnnmon.ie', 'huffp.st', 'forbes.com',
        'telegraph.co', 'cnn.com', 'trib.al', 'express.co', 'gu.com',
        'bloom.bg', 'hill.cm', 'natgeo.com', 'pbs.org', 'washingtonpost',
        'news.sky.com'
    ]
    for source in link_list:
        # latest = ObjectId("59abbfedf296532f80d18a47")  # dyncorp
        # latest = ObjectId("59abc7e2f296532ad483f4b6")  # lds
        # latest = ObjectId("59acc20df296533c88dbaed6")  # tm
        latest = ObjectId("5942946efe43ad1da80b1a79")  # news
        index = 0
        path_file = './datasets/insensitive/news/' + source.replace('.', '_') + '_'
        train_collection = news_database.news_collection()
        duplicated_doc = {}
        while True:
            documents = train_collection.find({
                '_id': {
                    '$gt': latest
                },
                'reference': {
                    '$regex': '.*' + source + '.*'
                }
            })
            if documents.count() == 0:
                break
            for doc in documents:
                try:
                    latest = doc['_id']
                    if not doc.get('text'):
                        # print('Ignore', 'Count ' + str(count), 'Id ' + str(doc['id']), str(doc['created_at']),
                        #      doc['reference'])
                        continue
                    content = doc['text']
                    if len(content) < 1000:
                        # logger.info('Ignore small content, Count ' + str(count))
                        continue
                    title = doc['title']
                    if len(title) > 60:
                        title = title[0:60]
                    title = "".join(x for x in title if x.isalnum())
                    if content not in duplicated_doc:
                        duplicated_doc[content] = True
                        index += 1
                        # logger.info(nltk.word_tokenize(content.lower()))
                        with open(path_file + title + '.txt',
                                  'w',
                                  encoding="utf-8") as doc_file:
                            doc_file.write(doc['text'])

                except Exception as e:
                    logger.error(doc['reference'] + ' : ' + str(e))
        print(source, index)
Code example #4
File: Main.py  Project: huyqut/thesis-spider
def mergeDB():
    news_collection = database.news_collection()
    train_collection = train_database.train_collection()
    latest = ObjectId("5942946efe43ad1da80b1a79")
    count = 0
    index = 1

    while True:
        documents = news_collection.find({'_id': {'$gt': latest}}).limit(100)
        if documents.count() == 0:
            break
        for doc in documents:
            count += 1
            try:
                latest = doc['_id']
                if not doc.get('text'):
                    print('Skip', doc['reference'])
                    continue

                if train_collection.find({'reference': doc['reference']}).count() > 0:
                    print('Skip duplicated reference ' + doc['reference'])
                    continue
                if train_collection.find({'text': doc['text']}).count() > 0:
                    print('Skip duplicated text ' + doc['reference'])
                    continue

                document = {
                    'id': doc['id'],
                    'created_at': doc['created_at'],
                    'reference': doc['reference'],
                    'title': doc['title'],
                    'text': doc['text'],
                    'image': doc['image'],
                }
                print('Insert ' + doc['reference'] + '  created at ' +
                      str(doc['created_at']))
                train_collection.insert_one(document)

            except Exception as e:
                print(e)
Code example #5
File: train.py  Project: huyqut/thesis-spider
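This excerpt starts mid-file: `logger`, `sys`, the gensim classes, and the database modules are defined earlier in train.py. The omitted header presumably looks roughly like this (module names are inferred from usage):

import logging
import sys

from gensim.corpora import WikiCorpus

import news_database   # project-local MongoDB helpers
import train_database

logger = logging.getLogger(__name__)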
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
enwiki_bin_location = 'training/metawiki-20170401-pages-articles.xml.bz2'
enwiki_txt_location = 'training/wiki-documents.txt'
doc2vec_model_location = 'model/doc2vec-model-temp-300.bin'
word2vec_model_location = 'model/word2vec-model.bin'
doc2vec_vectors_location = 'model/doc2vec-vectors.bin'
clustering_model_location = 'model/clustering_model.bin'
doc2vec_dimensions = 300
classifier_model_location = 'model/classifier-model.bin'

train_collection = train_database.train_collection()
news_collection = news_database.news_collection()

# Build the word2vec model from the corpus
# doc2vec.build_vocab(taggedDocuments)


def build_wiki_text():
    i = 0
    # A context manager ensures the output file is flushed and closed once the dump has been processed.
    with open(enwiki_txt_location, 'w+', encoding="utf-8") as output:
        wiki = WikiCorpus(enwiki_bin_location, lemmatize=False, dictionary={})
        for text in wiki.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            i = i + 1
            if i % 10000 == 0:
                logger.info("Saved " + str(i) + " articles")
Code example #6
File: spider.py  Project: huyqut/thesis-spider
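`locate_feeds` relies on several third-party libraries whose imports are not shown; based on the calls it makes, the header of spider.py likely includes something like this (project-local names are assumptions):

import time
from threading import Thread

import geograpy                # place/people/organisation extraction
from newspaper import Article  # article download and parsing

import database                # project-local MongoDB helpers
import thesis_logging          # project-local logging setup
# NewsConverter and the crawler_finish flag are defined elsewhere in the project.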
def locate_feeds(
    news_converter: NewsConverter,
    latest: int = 0,
):
    global crawler_finish
    logger = thesis_logging.get_logger('locator')
    news_collection = database.news_collection()

    class VectorConverter(Thread):
        def __init__(self, text):
            super().__init__()
            self.text = text
            self.vector = []

        def run(self):
            self.vector = news_converter.convert_doc_to_vector(
                self.text).tolist()

    class GeographyExtractor(Thread):
        def __init__(self, text):
            super().__init__()
            self.text = text
            self.places = []
            self.people = []
            self.organs = []

        def run(self):
            context = geograpy.get_place_context(text=self.text)
            self.places = context.places
            self.people = context.people
            self.organs = context.organs

    class PageParser(Thread):
        def __init__(self, tweet_id, url, collection):
            super().__init__()
            self.tweet_id = tweet_id
            self.url = url
            self.collection = collection

        def run(self):
            try:
                logger.info('Parse ' + self.url)
                article = Article(self.url)
                article.download()
                if article.download_exception_msg and "404" in article.download_exception_msg:
                    logger.error('404 not found, delete... ' + self.url)
                    news_collection.remove({"id": self.tweet_id})
                    return
                article.parse()
                ignore_list = [
                    "twitter.com", "youtube.com", "facebook.com",
                    "instagram.com"
                ]
                if any(x in article.canonical_link for x in ignore_list):
                    print('delete ' + article.canonical_link)
                    news_collection.remove({"id": self.tweet_id})
                    return
                logger.info('Title for ' + article.top_image + '  -  ' +
                            article.canonical_link + '\n' + article.title +
                            '\n\n')
                logger.info('Latest: ' + str(latest))
                vector_converter = VectorConverter(article.text)
                geography_extractor = GeographyExtractor(article.text)
                vector_converter.start()
                geography_extractor.start()
                geography_extractor.join()
                vector_converter.join()
                vector = vector_converter.vector
                news_collection.update_one({'id': self.tweet_id}, {
                    '$set': {
                        'places': geography_extractor.places,
                        'people': geography_extractor.people,
                        'organs': geography_extractor.organs,
                        'vector': vector,
                        'title': article.title,
                        'text': article.text,
                        'image': article.top_image
                    }
                })
                for place in geography_extractor.places:
                    self.collection.update_one({'place': place},
                                               {'$inc': {'count': 1}},
                                               upsert=True)

            except Exception as e:
                logger.error(str(e))

    location_collection = database.location_collection()
    duplicate_urls = {}
    tasks = []
    while True:
        documents = news_collection.find({
            'created_at': {
                '$gte': latest
            }
        }).limit(100)
        logger.info('Found ' + str(documents.count()) + ' after ' +
                    str(latest))

        # Clean up remaining tasks
        if len(tasks) != 0:
            logger.info('Cleaning up remaining tasks')
            for task in tasks:
                task.join()
            tasks.clear()

        if documents.count() == 1:
            if crawler_finish:
                break
            logger.warning('Nap and back in 500 seconds')
            time.sleep(500)
            continue

        logger.info('Start Locating')
        index = 0

        for doc in documents:
            try:
                ref = doc['reference']
                latest = doc['created_at']
                image = doc.get('image')

                if image is not None:
                    logger.info('image skip')
                    continue
                if news_collection.find({'reference': ref}).count() > 1:
                    logger.info('delete duplicate ' + ref)
                    news_collection.remove({"id": doc['id']})
                    continue

                thread = PageParser(doc['id'], ref, location_collection)
                tasks.append(thread)
                thread.start()
                time.sleep(7)
                index += 1
                if index % 5 == 0:
                    for task in tasks:
                        task.join()
                    tasks.clear()

            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))
Code example #7
File: spider.py  Project: huyqut/thesis-spider
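Likewise, `crawl_feeds` depends on a project-local `TwitterDev` wrapper (its `api` attribute is used like a python-twitter `Api` instance) plus the usual helpers; an assumed import header:

import time

import database        # project-local MongoDB helpers
import thesis_logging  # project-local logging setup
# TwitterDev is the project's Twitter-credentials wrapper; its module path is not shown in this listing.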
def crawl_feeds(dev: TwitterDev, duration: int = 0):
    global crawler_finish
    logger = thesis_logging.get_logger('crawler')
    while True:
        try:
            if dev is None:
                logger.error('There is no Twitter developer account detected.')
                return
            news_collection = database.news_collection()
            logger.info('ok')
            user_id = dev.api.VerifyCredentials()
            logger.info('Twitter Auth: ' + str(user_id.AsJsonString()))
            friends = dev.api.GetFriendIDs(user_id, stringify_ids=True)
            logger.info('Friends: ' + str(friends))
            logger.info('Start crawling')
            start = int(round(time.time()) * 1000)
            link_list = [
                'cnn.it', 'nyti.ms', 'nbcnews', 'apne.ws', 'reut.rs',
                'wapo.st', 'abcn.ws', 'nbcbay.com', 'bbc.in', 'huff.to',
                'ti.me', 'cbsn.ws', 'huffingtonpost.com', 'cnb.cx',
                'cnnmon.ie', 'huffp.st', 'forbes.com', 'telegraph.co',
                'cnn.com', 'trib.al', 'express.co', 'gu.com', 'bloom.bg',
                'hill.cm', 'natgeo.com', 'pbs.org', 'washingtonpost',
                'news.sky.com'
            ]
            ignore_list = [
                'bit.ly',
                'twitter',
                'tinyurl',
                'goo.gl',
                'facebook.com',
            ]
            duplicate_urls = {}
            for status in dev.api.GetStreamFilter(follow=friends):
                urls = status['entities']['urls']
                if len(urls) == 0:
                    continue
                url = urls[0]['expanded_url']

                if url is None:
                    continue

                if not any(x in url for x in link_list):
                    logger.info('Skip link ' + url)
                    continue

                if news_collection.find({'reference': url}).count() > 0:
                    logger.info('Skip duplicated ' + url)
                    continue

                timestamp = int(
                    time.mktime(
                        time.strptime(status['created_at'],
                                      '%a %b %d %H:%M:%S +0000 %Y')))

                document = {
                    'id': status['id'],
                    'created_at': timestamp,
                    'reference': url
                }
                news_collection.insert_one(document)
                logger.info('Insert ' + url + '  created at ' + str(timestamp))
                # if duration != 0 and int(round(time.time()) * 1000) - start > duration:
                #    break
        except Exception as e:
            logger.error(e)
        finally:
            # crawler_finish = True
            logger.info('Finish crawling')
            logger.info('Sleeping 5s to start again...')
            time.sleep(5)