def init_db(gatherer_db_uri, company_webpages_collection_name):

    global mongo_client, gatherer_database, company_webpages_collection

    mongo_client = MongoClient(gatherer_db_uri)
    gatherer_database = mongo_client.get_default_database()
    company_webpages_collection = gatherer_database[
        company_webpages_collection_name]
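
A quick usage sketch for init_db (the URI and collection name below are placeholders; MongoClient is assumed to come from pymongo):

# Hypothetical bootstrap call; afterwards the module-level globals are usable.
init_db("mongodb://localhost:27017/gatherer", "company_webpages")
print(company_webpages_collection.find_one())
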
Example #2
class MongoConnector:
    def __init__(self, host=None):
        if host is None:
            self.mongoClient = MongoClient(MONGODB_URI)
        else:
            self.mongoClient = MongoClient(MONGODB_LOCAL_URI)
        self.ipDBProd = self.mongoClient.get_default_database()
        self.ipDBDev = self.mongoClient.ipstats
        self.ipStats = self.mongoClient.ipstats

        self.ipDevBanners = self.ipDBDev.ips_dev

        self.ipsBanners = self.ipDBProd.ips_banners
        self.ipsHosts = self.ipDBProd.ips_hosts
        self.processedIps = self.ipDBProd.processed_ips
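
A short usage sketch for MongoConnector, assuming MONGODB_URI and MONGODB_LOCAL_URI are defined elsewhere in the same module:

# Connect with the default (remote) URI and query two of the exposed collections.
connector = MongoConnector()                    # host=None -> MONGODB_URI
banner = connector.ipsBanners.find_one()        # prod "ips_banners" collection
dev_banner = connector.ipDevBanners.find_one()  # dev "ips_dev" collection
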
 def test_get_default_database_with_authsource(self):
     # Ensure we distinguish database name from authSource.
     uri = "mongodb://%s:%d/foo?authSource=src" % (host, port)
     c = MongoClient(uri, connect=False)
     self.assertEqual(Database(c, 'foo'), c.get_default_database())
 def test_get_default_database(self):
     c = MongoClient("mongodb://%s:%d/foo" % (host, port), connect=False)
     self.assertEqual(Database(c, 'foo'), c.get_default_database())
 def __init__(self, mongo_uri: str) -> None:
     client = MongoClient(mongo_uri)
     db = client.get_default_database()
     self.collection = db['advertisements']
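
Assuming this __init__ belongs to a small repository class (named AdvertisementStore below purely for illustration), it could be used like this:

# AdvertisementStore is a hypothetical name for the class this __init__ comes from.
store = AdvertisementStore("mongodb://localhost:27017/ads")
latest = store.collection.find_one(sort=[("_id", -1)])  # newest advertisement, if any
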
Example #6
 def test_get_default_database_with_authsource(self):
     # Ensure we distinguish database name from authSource.
     uri = "mongodb://%s:%d/foo?authSource=src" % (host, port)
     c = MongoClient(uri, _connect=False)
     self.assertEqual(Database(c, 'foo'), c.get_default_database())
Example #7
 def test_get_default_database(self):
     c = MongoClient("mongodb://%s:%d/foo" % (host, port), _connect=False)
     self.assertEqual(Database(c, 'foo'), c.get_default_database())
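
Both variants of the test exercise the same behaviour: the database named in the URI path is what get_default_database() returns, while authSource only changes where the credentials are authenticated. A minimal sketch (host and port are placeholders; behaviour as in pymongo 3.x):

from pymongo import MongoClient
from pymongo.errors import ConfigurationError

# The path component ("foo") selects the default database; authSource does not.
client = MongoClient("mongodb://localhost:27017/foo?authSource=admin", connect=False)
assert client.get_default_database().name == "foo"

# Without a path there is no default database and pymongo raises ConfigurationError.
client = MongoClient("mongodb://localhost:27017", connect=False)
try:
    client.get_default_database()
except ConfigurationError:
    pass
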
Example #8
class CompanyWebSiteSpider(scrapy.Spider):

    name = "website_crawler"

    EMAIL_RE = r'[\w\.-]+@[\w\.-]+'

    speciality = None

    mongo_client = None
    gatherer_database = None

    nlp = None
    dictionary = PyDictionary()

    stop_words = None

    words_to_find = None

    verbs_to_find = None

    def __init__(self, speciality=None, *args, **kwargs):
        super(CompanyWebSiteSpider, self).__init__(*args, **kwargs)

        self.logger.logger.setLevel(logging.INFO)

        if speciality is None:
            raise Exception("The speciality must be informed")

        self.speciality = speciality

        nltk.data.path.append(
            '/Users/xalperte/BigMLDev/company_web_scrapy/webscrawler/nltk_data'
        )

        self.stop_words = stopwords.words('english')

        self.nlp = spacy.load('en')

    def init(self):

        self.mongo_client = MongoClient(
            self.settings.get('MONGO_GATHERER_BD_URI'))
        self.gatherer_database = self.mongo_client.get_default_database()

        self.prepare_words_to_find()
        self.prepare_verbs_to_find()

    def start_requests(self):

        self.init()

        companies_by_id = self.load_companies()
        company_num = 0
        current_company = None
        try:
            # companies_by_id = self.load_jim_companies()

            for company_id, company in companies_by_id.iteritems():
                current_company = company

                if 'webpage' in company:
                    company_num += 1
                    self.logger.info(
                        "[%d] Launch Home Page Request for %s - %s - %s - %s" %
                        (company_num, company['webpage'],
                         company['company_id'], company['company_name'],
                         company['webpage']))
                    yield scrapy.Request(url=company['webpage'],
                                         meta={
                                             'url':
                                             company['webpage'],
                                             'company_id':
                                             company['company_id'],
                                             'company_name':
                                             company['company_name'],
                                             'company_num':
                                             company_num,
                                             'company_home_page':
                                             company['webpage']
                                         },
                                         errback=self.error,
                                         callback=self.parse_website)

                    # company_num += 1
                    # if company_num == 10:
                    #     break
                else:
                    self.logger.warning("The company [%s] "
                                        "doesn't have webpage infomed" %
                                        company_id)
        except Exception as e:
            self.logger.error(
                "Error processing company at [%d]. Company ID: [%s]. Cause [%s]"
                % (company_num, current_company['company_id'], repr(e)))

    def parse_website(self, response):

        try:
            self.logger.info(
                "[%d] Parsing Home Page from %s - %s - %s - %s" %
                (response.meta['company_num'], response.url,
                 response.meta['company_id'], response.meta['company_name'],
                 response.meta['company_home_page']))

            self.update_company_page(response)

            home_url = urlparse.urlparse(response.url)
            home_netloc = home_url.netloc.lower()

            # Follow only links within the company website; ignore links
            # that point to other sites.
            processed_links = set()
            requested_links = 0
            for link_data in self.get_links(self.guess_root(response.url),
                                            response.body):
                if link_data[1] not in processed_links:
                    processed_links.add(link_data[1])

                    link_url = urlparse.urlparse(link_data[1])
                    link_netloc = link_url.netloc.lower()

                    if home_netloc == link_netloc:
                        if self.follow_link(link_data):
                            requested_links += 1

                            # Follow at most 10 links per home page
                            if requested_links > 10:
                                break

                            yield scrapy.Request(
                                url=link_data[1],
                                meta={
                                    'url_name':
                                    link_data[0],
                                    'url':
                                    link_data[1],
                                    'company_id':
                                    response.meta['company_id'],
                                    'company_name':
                                    response.meta['company_name'],
                                    'company_home_page':
                                    response.meta['company_home_page']
                                },
                                errback=self.error,
                                callback=self.parse_internal_page)

                            self.logger.debug(
                                "\t Link to follow: [%s] - [%s]" %
                                (link_data[0], link_data[1]))
                        else:
                            self.logger.debug(
                                "\t NOT Link to follow (not words in the link): [%s] - [%s]."
                                % (link_data[0], link_data[1]))
                    else:
                        self.logger.debug(
                            "\t NOT Link to follow (out of the company website): [%s] - [%s]. "
                            "Home Netloc [%s] - Link Netloc [%s]" %
                            (link_data[0], link_data[1], home_netloc,
                             link_netloc))

        except Exception as e:
            self.logger.error(
                "Home page parsing exception. URL [%s]. Cause [%s]" %
                (response.url, repr(e)))

    def parse_internal_page(self, response):

        try:
            self.logger.debug(
                "Parsing Internal Page from %s - %s - %s - %s - %s" %
                (response.meta['url_name'], response.url,
                 response.meta['company_id'], response.meta['company_name'],
                 response.meta['company_home_page']))

            self.update_company_page(response,
                                     url_name=response.meta['url_name'],
                                     is_home=False)

        except Exception as e:
            self.logger.error(
                "Internal page parsing exception. URL [%s]. Cause [%s]" %
                (response.url, repr(e)))

    def follow_link(self, link_data):
        for word in self.words_to_find:
            if word in link_data[0] or \
                            word in link_data[2] or \
                            word in link_data[3]:
                return True
        return False

    def error(self, failure):
        # log all failures
        self.logger.error("Request Error!")
        self.logger.error(repr(failure))

        # in case you want to do something special for some errors,
        # you may need the failure's type:
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
            self.write_wrong_website('HTTP', response.meta['company_id'],
                                     response.meta['company_name'],
                                     response.url, repr(failure))

        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)
            self.write_wrong_website('DNSLookup', request.meta['company_id'],
                                     request.meta['company_name'], request.url,
                                     repr(failure))

        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
            self.write_wrong_website('Timeout', request.meta['company_id'],
                                     request.meta['company_name'], request.url,
                                     repr(failure))

    def get_links(self, root, html_doc):
        soup = BeautifulSoup(html_doc, 'html.parser')
        return self.resolve_links(root, soup.find_all('a', href=True))

    def guess_root(self, base_url):
        if base_url.startswith('http'):
            parsed_link = urlparse.urlparse(base_url)
            scheme = parsed_link.scheme + '://'
            netloc = parsed_link.netloc
            return scheme + netloc

    def resolve_links(self, root, links):
        for link in links:
            link_title = link.get_text()
            link_href = link['href']
            if not link_href.startswith('http'):
                link_href = urlparse.urljoin(root, link_href)

            # Bad urls (email attached to the url)
            match = re.findall(self.EMAIL_RE, link_href)
            if match and len(match) > 0:
                for email in match:
                    link_href = link_href.replace(email, '')

            if link_href.endswith('/admin'):
                link_href = link_href.replace('/admin', '')

            yield (link_title, link_href, urlparse.urlparse(link_href).path,
                   urlparse.urlparse(link_href).query)

    def prepare_verbs_to_find(self):

        base_verbs = [
            'give', 'offer', 'contribute', 'administer', 'bring', 'provide',
            'supply', 'manufacture', 'produce', 'automate', 'commodity',
            'sell', 'solve', 'build'
        ]

        self.verbs_to_find = set()
        for word in base_verbs:
            self.verbs_to_find.add(word)
            for idx, synset in enumerate(wordnet.synsets(word)):
                for synonym in synset.lemma_names():
                    self.verbs_to_find.add(synonym.replace('_', ' '))

                    # hypernyms = [l.lemma_names() for l in synset.hypernyms()]
                    # for hypernym in hypernyms:
                    #     for word in hypernym:
                    #         self.verbs_to_find.add(word.replace('_', ' '))
                    #
                    # hyponyms = [l.lemma_names() for l in synset.hyponyms()]
                    # for hyponym in hyponyms:
                    #     for word in hyponym:
                    #         self.verbs_to_find.add(word.replace('_', ' '))

        stop_verbs = set([
            'get', 'have', 'be', 'add', 'work', 'reach', 'open', 'create',
            'take', 'break'
        ])

        self.verbs_to_find = self.verbs_to_find.difference(stop_verbs)

    def prepare_words_to_find(self):

        all_speciality_words = set()
        for idx, synset in enumerate(wordnet.synsets(self.speciality)):
            for synonym in synset.lemma_names():
                all_speciality_words.add(synonym.replace('_', ' '))

            # hypernyms = [l.lemma_names() for l in synset.hypernyms()]
            # for hypernym in hypernyms:
            #     for word in hypernym:
            #         all_speciality_words.add(word.replace('_', ' '))
            #
            # hyponyms = [l.lemma_names() for l in synset.hyponyms()]
            # for hyponym in hyponyms:
            #     for word in hyponym:
            #         all_speciality_words.add(word.replace('_', ' '))

        # Words related to the market
        market_words = [
            'mechanics', 'unfolding', 'marketplace', 'deploy', 'give',
            'contribute', 'administer', 'bring', 'service', 'result',
            'technology', 'market', 'use', 'compose', 'prepare', 'provide',
            'make', 'support', 'business', 'supply', 'manufacture', 'product',
            'robotics', 'ability', 'form', 'automate', 'produce', 'about',
            'resource', 'commodity', 'vend', 'wholesale', 'work', 'solution',
            'duty', 'retail', 'display', 'mission', 'vision'
        ]

        all_market_words = set()
        for word in market_words:
            all_market_words.add(word)
            for idx, synset in enumerate(wordnet.synsets(word)):
                for synonym in synset.lemma_names():
                    all_market_words.add(synonym.replace('_', ' '))

            # hypernyms = [l.lemma_names() for l in synset.hypernyms()]
            # for hypernym in hypernyms:
            #     for word in hypernym:
            #         all_market_words.add(word.replace('_', ' '))
            #
            # hyponyms = [l.lemma_names() for l in synset.hyponyms()]
            # for hyponym in hyponyms:
            #     for word in hyponym:
            #         all_market_words.add(word.replace('_', ' '))

        communication_words = [
            'disclosure', 'communication', 'article', 'announcement', 'story',
            'record', 'blog', 'intelligence', 'journal', 'advice', 'diary',
            'news', 'forum'
        ]

        all_communication_words = set()
        for word in communication_words:
            all_communication_words.add(word)
            for idx, synset in enumerate(wordnet.synsets(word)):
                for synonym in synset.lemma_names():
                    all_communication_words.add(synonym.replace('_', ' '))

            # hypernyms = [l.lemma_names() for l in synset.hypernyms()]
            # for hypernym in hypernyms:
            #     for word in hypernym:
            #         all_communication_words.add(word.replace('_', ' '))
            #
            # hyponyms = [l.lemma_names() for l in synset.hyponyms()]
            # for hyponym in hyponyms:
            #     for word in hyponym:
            #         all_communication_words.add(word.replace('_', ' '))

        special_words = {'rss'}

        self.logger.debug("Speciality Words to find: [%s]" %
                          ','.join(all_speciality_words))
        self.logger.debug("Communication Words to find: [%s]" %
                          ','.join(all_communication_words))
        self.logger.debug("Market Words to find: [%s]" %
                          ','.join(all_market_words))
        self.logger.debug("Commons Words between sections: [%s]" % ','.join(
            set.intersection(all_speciality_words, all_market_words,
                             all_communication_words)))

        self.words_to_find = set.union(all_speciality_words, all_market_words,
                                       all_communication_words, special_words)

    def write_wrong_website(self, type, company_id, company_name, company_url,
                            error):
        with open('error-%s.txt' % type, 'ab') as f:
            f.write("\"%s\",\"%s\",\"%s\",\"%s\"\n" %
                    (company_id, company_name, company_url, error))

    def write_wrong_specialty(self, company_id, company_name, company_url):
        with open('wrong-specialty-%s.txt' % self.speciality, 'ab') as f:
            f.write("\"%s\",\"%s\",\"%s\"\n" %
                    (company_id, company_name, company_url))

    def update_company_page(self, response, url_name='Home', is_home=True):

        companies_pages = \
            self.gatherer_database['company_webpages']

        soup = BeautifulSoup(response.body, 'html.parser')

        # Remove the script and style tags
        [x.extract() for x in soup.findAll('script')]
        [x.extract() for x in soup.findAll('style')]
        [x.extract() for x in soup.select('[style*="visibility:hidden"]')]
        [x.extract() for x in soup.select('[style*="display:none"]')]

        page_text = soup.get_text()

        # strip empty lines
        page_text = "".join(
            [s for s in page_text.strip().splitlines(True) if s.strip()])

        if self.speciality not in page_text and is_home:
            self.write_wrong_specialty(response.meta['company_id'],
                                       response.meta['company_name'],
                                       response.url)

        keywords = self.get_page_meta('keywords', response.body)
        if keywords is not None:
            keywords = [keyword.strip() for keyword in keywords.split(',')]

        title = self.get_page_meta('title', response.body)
        title_bags_of_words = self.decompose_sentences(
            [title] if title is not None else None)

        description = self.get_page_meta('description', response.body)
        description_bags_of_words = self.decompose_sentences(
            [description] if description is not None else None)

        abstract = self.get_page_meta('abstract', response.body)
        abstract_bags_of_words = self.decompose_sentences(
            [abstract] if abstract is not None else None)

        sentences = self.find_sentences(page_text)
        sentences = self.decompose_sentences(sentences)

        companies_pages.update(
            {
                'company_id': response.meta['company_id'],
                'url': response.url
            }, {
                '$setOnInsert': {
                    'company_id': response.meta['company_id'],
                    'url': response.url,
                    'created': datetime.datetime.now()
                },
                '$set': {
                    'updated': datetime.datetime.now(),
                    'url_name': url_name.strip(),
                    'company_name': response.meta['company_name'],
                    'specialty': self.speciality,
                    'title': title,
                    'description': description,
                    'abstract': abstract,
                    'keywords': keywords,
                    'is_home': is_home,
                    'content': soup.prettify(),
                    'content_plain_text': page_text,
                    'sentences': sentences,
                    'bags_of_words_in_meta': {
                        'title': title_bags_of_words,
                        'description': description_bags_of_words,
                        'abstract': abstract_bags_of_words,
                        'keywords': keywords
                    }
                }
            },
            upsert=True)

    def find_sentences(self, page_content):
        # http://stackoverflow.com/questions/36610179/how-to-get-the-dependency-tree-with-spacy/36612605

        # page_content = page_content.lower()

        # lines = page_content.split('\n')

        # remove lines with less than 4 words
        # processed_text = ""
        # for line in page_content.split('\n'):
        #     if line.split(' ') >= 4:
        #         processed_text += line

        if isinstance(page_content, str):
            page_content = page_content.decode('utf-8')

        doc = self.nlp(page_content.replace('\n', '.\n'))

        sents = set()

        for sent in doc.sents:
            for token in sent:

                # Phrasal verb?
                if token.dep_ == "prt" and token.head.pos_ == "VERB":
                    verb = token.head.orth_
                    particle = token.orth_
                    phrasal_verb = ' '.join([verb, particle])
                    if phrasal_verb in self.verbs_to_find:
                        sents.add(sent.string)

                elif token.pos == VERB and \
                    token.lemma_ in self.verbs_to_find:
                    sents.add(sent.string)

        return list(sents) if len(list(sents)) > 0 else None
        #     for token in sent:
        #         if token.is_alpha:
        #
        # sentences_list = []
        # for line in lines:
        #     sentences_list.extend(word_tokenize(sentence) for sentence in sent_tokenize(line))
        #
        # parser = nltk.ChartParser(gro)
        # for sentence in sentences_list:
        #

        # return sentences_list

    def decompose_sentences(self, sentences):
        """
        For each sentence we are going to create different bags of words based
        on the kind of the meaning/content:

            entities: {
                persons:
                organizations:
                locations:
                products:
                events:
                work_of_art:
                languages:
            },

            noun_chunks: {
            }

            nouns: {
            }

            verbs: {
            }

        Supported entities

            PERSON	People, including fictional.
            NORP	Nationalities or religious or political groups.
            FACILITY	Buildings, airports, highways, bridges, etc.
            ORG	Companies, agencies, institutions, etc.
            GPE	Countries, cities, states.
            LOC	Non-GPE locations, mountain ranges, bodies of water.
            PRODUCT	Objects, vehicles, foods, etc. (Not services.)
            EVENT	Named hurricanes, battles, wars, sports events, etc.
            WORK_OF_ART	Titles of books, songs, etc.
            LANGUAGE	Any named language.

        :param sentences: the list of sentences
        :return: a dictionary with the different types of bags of words
        """

        sentences_list = []

        if sentences is not None:
            for sentence in sentences:

                sentence_data = {'sentence': sentence, 'bags_of_words': None}

                bags_of_words = {
                    'all': [],
                    'entities': {
                        'PERSON': [],
                        'NORP': [],
                        'FACILITY': [],
                        'ORG': [],
                        'GPE': [],
                        'LOC': [],
                        'PRODUCT': [],
                        'EVENT': [],
                        'WORK_OF_ART': [],
                        'LANGUAGE': []
                    },
                    'noun_chunks': [],
                    'VERB': [],
                    'NOUN': []
                }

                doc = self.nlp(sentence if isinstance(sentence, unicode) else
                               sentence.decode('utf-8'))

                # process the entities of the sentence
                for entity in doc.ents:
                    if entity.label_ in [
                            'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC',
                            'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LANGUAGE'
                    ]:
                        entity_val = self.clean_stopwords(
                            entity.string.lower())
                        if len(entity_val.strip()) > 0:
                            synonyms = self.get_synonyms(sentence, entity_val)
                            bags_of_words['entities'][entity.label_].append(
                                (entity_val, synonyms))
                            bags_of_words['all'].append((entity_val, synonyms))

                # process the noun chunks of the sentence
                for noun_chunk in doc.noun_chunks:
                    # Lemmatizing the nouns (stemming)
                    noun_chunk_val = self.clean_stopwords(
                        noun_chunk.lemma_.lower())
                    if len(noun_chunk_val.strip()) > 0:
                        synonyms = self.get_synonyms(sentence, noun_chunk_val)

                        bags_of_words['noun_chunks'].append(
                            (noun_chunk_val, synonyms))
                        bags_of_words['all'].append((noun_chunk_val, synonyms))

                # process verbs and nouns of the sentence
                for word in doc:
                    if word.pos_ in ['VERB', 'NOUN']:
                        # Lemmatizing the words (stemming)
                        word_val = self.clean_stopwords(word.lemma_.lower())
                        if len(word_val.strip()) > 0:
                            synonyms = self.get_synonyms(sentence, word_val)

                            bags_of_words[word.pos_].append(
                                (word_val, synonyms))
                            bags_of_words['all'].append((word_val, synonyms))

                sentence_data['bags_of_words'] = bags_of_words
                sentences_list.append(sentence_data)

        return sentences_list

    def clean_stopwords(self, text):
        return ' '.join(
            [w for w in text.split(' ') if w.lower() not in self.stop_words])

    def get_synonyms(self, sentence, word):
        from pywsd.lesk import simple_lesk

        synonyms = set()

        if isinstance(sentence, str):
            sentence = sentence.decode('utf-8')

        if isinstance(word, str):
            word = word.decode('utf-8')

        synset = simple_lesk(sentence, word)
        if synset is not None:
            for synonym in synset.lemma_names():
                synonyms.add(synonym.replace('_', ' '))

        # for idx, synset in enumerate(wordnet.synsets(word)):
        #     for synonym in synset.lemma_names():
        #         synonyms.add(synonym.replace('_', ' '))

        return list(synonyms)

    def get_page_meta(self, meta_name, page_html):
        soup = BeautifulSoup(page_html, 'html.parser')

        value = ""
        for meta in soup.findAll("meta"):
            metaname = meta.get('name', '').lower()
            metaprop = meta.get('property', '').lower()
            if meta_name == metaname or metaprop.find(meta_name) > 0:
                if 'content' in meta.__dict__['attrs']:
                    try:
                        value = ' '.join(
                            [value, meta['content'].strip().encode('utf-8')])
                    except:
                        self.logger.error(
                            "Error looking for [%s] in the metadata. Meta: [%s]"
                            % (meta_name, meta))
                        raise Exception(
                            "Error looking for [%s] in the metadata. Meta: [%s]"
                            % (meta_name, meta))

        return value.strip() if value != "" else None

    def load_jim_companies(self):
        companies_collection = \
            self.gatherer_database['consolidated_company']

        companies_by_id = {}

        with open('wp.txt', 'rb') as file:
            for i, line in enumerate(file):
                values = line.split(',')
                company_id = values[0].replace('"', '')
                webpage = values[1].replace('"', '')
                if len(webpage.strip()) > 0:
                    if webpage.startswith('http//'):
                        webpage = webpage.replace('http//', 'http://')
                    if webpage.startswith('https//'):
                        webpage = webpage.replace('https//', 'https://')
                    if not webpage.startswith(
                            'http://') and not webpage.startswith('https://'):
                        webpage = "http://%s" % webpage

                    companies_by_id[company_id] = {
                        'company_id': company_id,
                        'webpage': webpage,
                        'top_speciality': self.speciality,
                    }

        # Adding the website information from the consolidated_company collection
        companies_domain = companies_collection.find(
            {
                "company_id": {
                    '$in': [
                        company['company_id']
                        for (company_id,
                             company) in companies_by_id.iteritems()
                    ]
                },
                "webpage": {
                    '$exists': True
                },
            }, {
                "company_id": 1,
                "company_name": 1
            })

        for company in companies_domain:
            companies_by_id[company['company_id']]['company_name'] = company[
                'company_name']

        self.logger.info("Companies with website informed: [%d]" %
                         len(companies_by_id))

        return companies_by_id

    def load_companies(self):

        companies_collection = \
            self.gatherer_database['consolidated_company']
        companies_top_speciality_collection = \
            self.gatherer_database['company_specialities']

        # Looking for all the companies in Machine Learning
        companies = companies_top_speciality_collection.find(
            {"tf_idf.top_tag": self.speciality}, {
                "company_id": 1,
                "tf_idf.top_tag": 1
            })

        companies_by_id = {}

        for company in companies:
            companies_by_id[company['company_id']] = {
                'company_id': company['company_id'],
                'top_speciality': company['tf_idf']['top_tag']
            }

        # Adding the website information from the consolidated_company collection
        companies_domain = companies_collection.find(
            {
                "company_id": {
                    '$in': [
                        company['company_id']
                        for (company_id,
                             company) in companies_by_id.iteritems()
                    ]
                },
                "webpage": {
                    '$exists': True
                },
            }, {
                "company_id": 1,
                "company_name": 1,
                "webpage": 1
            })

        for company in companies_domain:
            companies_by_id[company['company_id']]['company_name'] = company[
                'company_name']
            webpage = company['webpage']
            if webpage.startswith('http//'):
                webpage = webpage.replace('http//', 'http://')
            if webpage.startswith('https//'):
                webpage = webpage.replace('https//', 'https://')
            if not webpage.startswith('http://') and not webpage.startswith(
                    'https://'):
                webpage = "http://%s" % webpage

            companies_by_id[company['company_id']]['webpage'] = webpage

        self.logger.info("Companies with website informed: [%d]" %
                         len(companies_by_id))

        return companies_by_id
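
A hedged sketch of how this spider might be launched (the project settings must define MONGO_GATHERER_BD_URI and point to the gatherer database; the speciality argument is mandatory, as enforced in __init__):

# Command line (from the Scrapy project directory):
#   scrapy crawl website_crawler -a speciality="machine learning"

# Or programmatically:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(CompanyWebSiteSpider, speciality="machine learning")
process.start()
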
Example #9
def compute_aggregations(source_db_uri, source_collection, changed_intervals,
                         process_field, interval, operators_list,
                         field_names_list, resource_type):
    """
        Builds the aggregation pipeline, executes it (one aggregated result
        per changed interval), and returns the changed_intervals list with
        the result document attached to each interval.

        Ex.

        [{"$match": {
            "metric_name":"twitter_followers",
            "company_id": "2341231231",
            "date" {"$gte": lower_date, "$lt": upper_date}}},
         {"$project": {
              "date": "$date",
              "value": "$value"}},
         {"$sort": {"date": 1}},
         {"$group": {"_id": {"company_id": "$_id"},
                     "aggr_field_name_1": { "$last": "$value" },
                     "aggr_field_name_2": { "$sum": "$value" },
                     "aggr_field_name_3": { "$avg": "$value" }}},
         {"$project":
             {"_id": 0,
              "company_id": "$_id.company_id",
              "aggr_field_name_1": "aggr_field_name_1",
              "aggr_field_name_2": "aggr_field_name_2"
              "aggr_field_name_3": "aggr_field_name_3"}}]

         Returns a result document attached to the changed_intervals document:

         [{"company_id": "2341231231", "year": 2013, "interval": 7},
            "result": {"twitter_followers_last": 456,
                       "twitter_followers_first": 123,
                       "twitter_followers_count": 34
                       ...},
          {"company_id": "2341231231", "year": 2013, "interval": 8},
             "result": {"twitter_followers_last": 135},
          {"company_id": "2341231444", "year": 2015, "interval": 9},
             "result": {"twitter_followers_last": 1023},
          {"company_id": "2341231444", "year": 2015, "interval": 11},
             "result": {"twitter_followers_last": 1050},
         ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param changed_intervals: List of documents with the changed intervals
    :param process_field: field to process, e.g. "twitter_followers", "twitter_bio", ...
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param operators_list: list of aggregation operators, e.g. ["last", "sum", "avg"]
    :param field_names_list: list of aggregated field names, e.g.
        ["twitter_followers_last", "twitter_followers_sum", "twitter_followers_avg"]
    :param resource_type: Resource Type (company, person, ...)
    """

    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    print "Processing %d changed intervals...." % len(changed_intervals)
    NUM_OF_INTERVALS_TO_INFORM = 1000
    for idx, changed_interval in enumerate(changed_intervals):
        lower_date, upper_date = \
            interval_date_range(changed_interval, interval)

        # Base pipeline aggregation list
        pipeline_aggregated = [{
            "$match": {
                "%s_id" % resource_type:
                changed_interval['%s_id' % resource_type]
            }
        }, {
            "$unwind": "$%s_ts" % process_field
        }, {
            "$project": {
                "%s_id" % resource_type:
                "$%s_id" % resource_type,
                "value":
                "$%s_ts.value" % process_field,
                "date":
                "$%s_ts.date" % process_field,
                "updated":
                "$%s_ts.updated" % process_field,
                "year": {
                    "$year": "$%s_ts.date" % process_field
                },
                "interval":
                set_field_interval("%s_ts.date" % process_field, interval)
            }
        }, {
            "$match": {
                "date": {
                    "$gte": lower_date,
                    "$lt": upper_date
                }
            }
        }, {
            "$project": {
                "date": "$date",
                "value": "$value",
                "%s_id" % resource_type: "$%s_id" % resource_type
            }
        }, {
            "$sort": {
                "date": 1
            }
        }, {
            "$group": {
                "_id": {
                    "%s_id" % resource_type: "%s_id" % resource_type
                }
            }
        }, {
            "$project": {
                "_id": 0,
                "%s_id" % resource_type: "$_id.%s_id" % resource_type
            }
        }]

        # Extend the $group and $project stages with the aggregation operators
        for i in range(len(operators_list)):
            pipeline_aggregated[6]['$group'][field_names_list[i]] = \
                {"$" + operators_list[i]: "$value"} \
                    if (operators_list[i] != 'count') else {"$sum": 1}
            pipeline_aggregated[7]['$project'][field_names_list[i]] = \
                "$" + field_names_list[i]

        # Attach the aggregation result to the changed_interval document
        for aggregated_value in collection.aggregate(pipeline_aggregated,
                                                     allowDiskUse=True):
            changed_interval['result'] = {}
            for i in range(len(operators_list)):
                changed_interval['result'][field_names_list[i]] = \
                    aggregated_value[field_names_list[i]]

        if idx != 0 and idx % NUM_OF_INTERVALS_TO_INFORM == 0:
            print "%d intervals processed" % \
               ((idx / NUM_OF_INTERVALS_TO_INFORM) * NUM_OF_INTERVALS_TO_INFORM)

    client.close()
    return changed_intervals
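
For example, with operators_list=["last", "count"] and field_names_list=["twitter_followers_last", "twitter_followers_count"], the operator loop in compute_aggregations would leave the $group and $project stages of the pipeline looking roughly like this (sketch for resource_type="company"):

# pipeline_aggregated[6]
{"$group": {"_id": {"company_id": "$company_id"},
            "twitter_followers_last": {"$last": "$value"},
            "twitter_followers_count": {"$sum": 1}}}

# pipeline_aggregated[7]
{"$project": {"_id": 0,
              "company_id": "$_id.company_id",
              "twitter_followers_last": "$twitter_followers_last",
              "twitter_followers_count": "$twitter_followers_count"}}
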
Example #10
def compute_ts_aggregations_1toN(source_db_uri,
                                 source_collection,
                                 changed_intervals,
                                 process_field,
                                 date_field,
                                 interval,
                                 operators_list,
                                 field_names_list,
                                 resource_type,
                                 withinTheInterval=True):
    """
        Builds the aggregation pipeline, executes it (one aggregated result
        per changed interval), and returns the changed_intervals list with
        the result document attached to each interval.

        Ex.

        [{"$match": {
            "company_id": "2341231231",
            "date" {"$gte": lower_date, "$lt": upper_date}}},
         {"$project": {
              "date": "$date",
              "value": "$value"}},
         {"$sort": {"date": 1}},
         {"$group": {"_id": {"company_id": "$_id"},
                     "aggr_field_name_1": { "$last": "$value" },
                     "aggr_field_name_2": { "$sum": "$value" },
                     "aggr_field_name_3": { "$avg": "$value" }}},
         {"$project":
             {"_id": 0,
              "company_id": "$_id.company_id",
              "aggr_field_name_1": "aggr_field_name_1",
              "aggr_field_name_2": "aggr_field_name_2"
              "aggr_field_name_3": "aggr_field_name_3"}}]

         Returns a result document attached to the changed_intervals document:

         [{"company_id": "2341231231", "year": 2013, "interval": 7},
            "result": {"twitter_followers_last": 456,
                       "twitter_followers_first": 123,
                       "twitter_followers_count": 34
                       ...},
          {"company_id": "2341231231", "year": 2013, "interval": 8},
             "result": {"twitter_followers_last": 135},
          {"company_id": "2341231444", "year": 2015, "interval": 9},
             "result": {"twitter_followers_last": 1023},
          {"company_id": "2341231444", "year": 2015, "interval": 11},
             "result": {"twitter_followers_last": 1050},
         ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param changed_intervals: List of documents with the changed intervals
    :param process_field: field to process, e.g. "twitter_followers", "twitter_bio", ...
    :param date_field: name of the date field used for the interval bounds
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param operators_list: list of aggregation operators, e.g. ["last", "sum", "avg"]
    :param field_names_list: list of aggregated field names, e.g.
        ["twitter_followers_last", "twitter_followers_sum", "twitter_followers_avg"]
    :param resource_type: Resource Type (company, person, ...)
    :param withinTheInterval: if True, only values within the interval are
        aggregated (the $gte lower bound is added to the first $match stage)
    """

    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    company_processed = 0
    for changed_interval in changed_intervals:
        lower_date, upper_date = \
            interval_date_range(changed_interval, interval)

        # Base pipeline aggregation list
        pipeline_aggregated = [{
            "$match": {
                "%s_id" % resource_type:
                changed_interval['%s_id' % resource_type],
                date_field: {
                    "$lt": upper_date
                }
            }
        }, {
            "$project": {
                "id": "$%s_id" % resource_type,
                "date": "$%s" % date_field,
                "value": "$%s" % process_field
            }
        }, {
            "$sort": {
                "date": 1
            }
        }, {
            "$group": {
                "_id": {
                    "id": "$id"
                }
            }
        }, {
            "$project": {
                "_id": 0,
                'id': '$_id.id'
            }
        }]

        if withinTheInterval:
            pipeline_aggregated[0]['$match'][date_field]['$gte'] = lower_date

        # Extend the $group and $project stages with the aggregation operators
        for i in range(len(operators_list)):
            pipeline_aggregated[3]['$group'][field_names_list[i]] = \
                {"$" + operators_list[i]: "$value"} \
                    if (operators_list[i] != 'count') else {"$sum": 1}
            pipeline_aggregated[4]['$project'][field_names_list[i]] = \
                "$" + field_names_list[i]

        # Attach the aggregation result to the changed_interval document
        for aggregated_value in collection.aggregate(pipeline_aggregated,
                                                     allowDiskUse=True):
            changed_interval['result'] = {}
            for i in range(len(operators_list)):
                changed_interval['result'][field_names_list[i]] = \
                    aggregated_value[field_names_list[i]]

        company_processed += 1

        if company_processed % 100 == 0:
            print "Processed %d/%d companies" % (company_processed,
                                                 len(changed_intervals))

    client.close()

    return changed_intervals
Example #11
def get_changed_intervals(source_db_uri, source_collection, last_execution,
                          curr_date, process_field, interval, resource_type):
    """
        Builds the aggregation pipeline, executes it, and returns the
        intervals (periods) that have changes.

        Ex.

        [{"$match":
            {"metric_name": {"$in": ["twitter_followers", "twitter_following"]},
             "updated": {"$gte": last_execution, "$lt": curr_date}}},
         { "$project":
             {"company_id": "$_id",
               "date": "$date",
               "year": {"$year": "$date"},
               "interval": {"$month": "$date"}}},
         {"$group":
             {"_id": { "company_id": "$company_id","year":"$year",
                        "interval": "$interval"}}},
         {"$project":  {"_id": 0, "company_id": "$_id.company_id",
                         "year": "$_id.year", "interval": "$_id.interval"}}]

         Returns a list of changed intervals with this structure:

         [{"company_id": ObjectID("2341231231"), "year": 2013, "interval": 7},
          {"company_id": ObjectID("2341231231"), "year": 2013, "interval": 8},
          {"company_id": ObjectID("2341231444"), "year": 2015, "interval": 9},
          {"company_id": ObjectID("2341231444"), "year": 2015, "interval": 11}
          ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param last_execution: Lower date
    :param curr_date: Upper date
    :param process_field: metric to process: "twitter_followers"
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param resource_type: Resource Type (company, person, ...)
    """

    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    pipeline_periods = [{
        "$unwind": "$%s_ts" % process_field
    }, {
        "$project": {
            "%s_id" % resource_type: "$%s_id" % resource_type,
            "value": "$%s_ts.value" % process_field,
            "date": "$%s_ts.date" % process_field,
            "updated": "$%s_ts.updated" % process_field,
            "year": {
                "$year": "$%s_ts.date" % process_field
            },
            "interval": set_field_interval("%s_ts.date" % process_field,
                                           interval)
        }
    }, {
        "$match": {
            "updated": {
                "$lt": curr_date
            }
        }
    }, {
        "$group": {
            "_id": {
                "%s_id" % resource_type: "$%s_id" % resource_type,
                "year": "$year",
                "interval": "$interval"
            }
        },
    }, {
        "$project": {
            "_id": 0,
            "%s_id" % resource_type: "$_id.%s_id" % resource_type,
            "year": "$_id.year",
            "interval": "$_id.interval"
        }
    }]

    # If last_execution is informed, add the $gte clause to the "$match" elem.
    if last_execution is not None:
        pipeline_periods[2]['$match']["updated"]['$gte'] = last_execution

    changed_intervals = []

    for changed_interval in collection.aggregate(pipeline_periods,
                                                 allowDiskUse=True):
        changed_intervals.append(changed_interval)

    client.close()

    return changed_intervals
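
A hedged usage sketch for get_changed_intervals (URI, collection name and dates are placeholders):

import datetime

changed = get_changed_intervals(
    source_db_uri="mongodb://localhost:27017/gatherer",
    source_collection="company_metrics",           # placeholder collection name
    last_execution=datetime.datetime(2016, 1, 1),  # or None to scan everything
    curr_date=datetime.datetime.utcnow(),
    process_field="twitter_followers",
    interval="month",
    resource_type="company")
# -> e.g. [{"company_id": ..., "year": 2016, "interval": 3}, ...]
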
Example #12
def update_preseries(target_db_uri, target_collection, source_db_uri,
                     source_collection, interval, is_prediction,
                     computed_aggregations, resource_type):
    """
        Update aggregated collection in the target system

    :param target_db_uri: Target DB URI, i.e. mongodb://localhost/databasename
    :param target_collection: Output collection name
    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param is_prediction: Boolean flag
    :param computed_aggregations: List of changed-interval documents with results
    :param resource_type: Resource Type (company, person, ...)
    """

    tgt_client = MongoClient(target_db_uri)
    tgt_db = tgt_client.get_default_database()
    tgt_col = tgt_db[target_collection]

    src_client = MongoClient(source_db_uri)
    src_db = src_client.get_default_database()
    src_col = src_db[source_collection]

    project_clause = {
        "_id": 0,
        "company_name": "$company_name",
        "company_foundation_date": "$foundation_date"
    }
    if resource_type == 'person':
        project_clause = {
            "_id": 0,
            "first_name": "$first_name",
            "last_name": "$last_name",
            "gender": "$gender"
        }
    if resource_type == 'investor':
        project_clause = {
            "_id": 0,
            "investor_name": "$investor_name",
            "investor_foundation_date": "$foundation_date"
        }

    bulk_counter = 0
    full_counter = 0
    block_size = 10000
    aggregation_num = len(computed_aggregations)
    tries_num = 3
    bulk = tgt_col.initialize_ordered_bulk_op()

    while full_counter < aggregation_num:

        comp_aggregation = computed_aggregations[full_counter]

        fields_to_insert, fields_to_update = \
            prepare_fields(src_col, project_clause, resource_type,
                           comp_aggregation, interval, is_prediction)

        find_pipeline = get_find_pipeline(resource_type, comp_aggregation,
                                          interval, fields_to_insert,
                                          is_prediction)

        bulk.find(find_pipeline).upsert().update({
            "$setOnInsert": fields_to_insert,
            "$set": fields_to_update
        })

        bulk_counter += 1
        full_counter += 1

        # Manage a page of block_size records
        if bulk_counter == block_size:
            try:
                bulk.execute()
                tries_num = 3
                bulk = tgt_col.initialize_ordered_bulk_op()
                bulk_counter = 0
                print "%d records processed" % full_counter
            except BulkWriteError as ex:  # give a second chance to the execute
                if tries_num == 0:
                    print "bulk.execute() failed 3 times..."
                    print "ERROR processing Task. Exception: [%s]" % ex
                    traceback.print_exc()
                    raise ex
                sleep(0.5)
                bulk = tgt_col.initialize_ordered_bulk_op()
                bulk_counter = 0
                full_counter -= block_size
                tries_num -= 1
            except Exception as ex2:
                print "ERROR processing Task. Exception: [%s]" % ex2
                traceback.print_exc()
                raise ex2

    # Manage rest of records from the latest complete page to the end
    if bulk_counter > 0:
        try:
            bulk.execute()
            print "%d records processed. Finished" % full_counter
        except BulkWriteError as ex:  # give a second chance to the execute
            sleep(1)
            bulk = tgt_col.initialize_ordered_bulk_op()
            full_counter = aggregation_num - bulk_counter
            for comp_aggr_inx in range(full_counter, aggregation_num):
                comp_aggregation = computed_aggregations[comp_aggr_inx]
                if len(comp_aggregation['result']) == 0:
                    continue

                fields_to_insert, fields_to_update = \
                    prepare_fields(src_col, project_clause, resource_type,
                           comp_aggregation, interval, is_prediction)

                find_pipeline = get_find_pipeline(resource_type,
                                                  comp_aggregation, interval,
                                                  fields_to_insert,
                                                  is_prediction)

                bulk.find(find_pipeline).upsert().update({
                    "$setOnInsert":
                    fields_to_insert,
                    "$set":
                    fields_to_update
                })

                full_counter += 1

            bulk.execute()
            print "%d records processed. Finished" % full_counter
        except Exception as ex:
            print "ERROR processing Task. Exception: [%s]" % ex
            traceback.print_exc()
            raise ex

    tgt_client.close()
    src_client.close()
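
Putting the helpers in this module together, a batch run might look like the sketch below (all URIs, collection names and dates are placeholders; set_field_interval, interval_date_range, prepare_fields and get_find_pipeline are assumed to be defined elsewhere in the module, as used above):

import datetime

SRC_URI = "mongodb://localhost:27017/gatherer"    # placeholder
TGT_URI = "mongodb://localhost:27017/preseries"   # placeholder
last_run = datetime.datetime(2016, 1, 1)          # last successful run
now = datetime.datetime.utcnow()

changed = get_changed_intervals(SRC_URI, "company_metrics", last_run, now,
                                "twitter_followers", "month", "company")

aggregations = compute_aggregations(
    SRC_URI, "company_metrics", changed, "twitter_followers", "month",
    ["last", "first", "count"],
    ["twitter_followers_last", "twitter_followers_first",
     "twitter_followers_count"],
    "company")

update_preseries(TGT_URI, "company_month_aggregations", SRC_URI,
                 "consolidated_company", "month", False, aggregations,
                 "company")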