def init_db(gatherer_db_uri, company_webpages_collection_name):
    global mongo_client, gatherer_database, company_webpages_collection
    mongo_client = MongoClient(gatherer_db_uri)
    gatherer_database = mongo_client.get_default_database()
    company_webpages_collection = gatherer_database[
        company_webpages_collection_name]
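A minimal usage sketch for the initializer above; the URI and collection name are hypothetical placeholders, not values taken from the original project.

from pymongo import MongoClient  # dependency of init_db() above

# The database name must be part of the URI so that get_default_database()
# can resolve it without a separate database argument.
init_db("mongodb://localhost:27017/gatherer", "company_webpages")
print(company_webpages_collection.full_name)  # -> "gatherer.company_webpages"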
class MongoConnector:
    def __init__(self, host=None):
        if host is None:
            self.mongoClient = MongoClient(MONGODB_URI)
        else:
            self.mongoClient = MongoClient(MONGODB_LOCAL_URI)
        self.ipDBProd = self.mongoClient.get_default_database()
        self.ipDBDev = self.mongoClient.ipstats
        self.ipStats = self.mongoClient.ipstats
        self.ipDevBanners = self.ipDBDev.ips_dev
        self.ipsBanners = self.ipDBProd.ips_banners
        self.ipsHosts = self.ipDBProd.ips_hosts
        self.processedIps = self.ipDBProd.processed_ips
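get_default_database() only works when the connection string names a database, so MONGODB_URI above is presumably of the form mongodb://host:27017/&lt;dbname&gt;. A hedged sketch (the URIs are made up) of both the success and failure cases:

from pymongo import MongoClient
from pymongo.errors import ConfigurationError

client = MongoClient("mongodb://localhost:27017/ipstats")
assert client.get_default_database().name == "ipstats"

bare_client = MongoClient("mongodb://localhost:27017/")
try:
    bare_client.get_default_database()
except ConfigurationError:
    print("no default database declared in the URI")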
def test_get_default_database_with_authsource(self):
    # Ensure we distinguish database name from authSource.
    uri = "mongodb://%s:%d/foo?authSource=src" % (host, port)
    c = MongoClient(uri, connect=False)
    self.assertEqual(Database(c, 'foo'), c.get_default_database())
def test_get_default_database(self):
    c = MongoClient("mongodb://%s:%d/foo" % (host, port), connect=False)
    self.assertEqual(Database(c, 'foo'), c.get_default_database())
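The two tests above assert that the path component of the URI becomes the default database. A hedged companion check (placeholder host and port): MongoClient.get_database() with no name falls back to the same default.

from pymongo import MongoClient
from pymongo.database import Database

c = MongoClient("mongodb://localhost:27017/foo", connect=False)
assert c.get_default_database() == Database(c, "foo")
assert c.get_database() == Database(c, "foo")  # no name -> URI default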
def __init__(self, mongo_uri: str) -> None:
    client = MongoClient(mongo_uri)
    db = client.get_default_database()
    self.collection = db['advertisements']
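A hypothetical wrapper showing how a constructor like the one above is typically used; the class name, document fields and URI are illustrative only.

from pymongo import MongoClient


class AdvertisementStore:
    def __init__(self, mongo_uri: str) -> None:
        client = MongoClient(mongo_uri)
        db = client.get_default_database()
        self.collection = db['advertisements']

    def save(self, ad: dict) -> None:
        # Upsert keyed on a hypothetical unique "ad_id" field.
        self.collection.update_one(
            {'ad_id': ad['ad_id']}, {'$set': ad}, upsert=True)


store = AdvertisementStore("mongodb://localhost:27017/ads")
store.save({'ad_id': 'a-1', 'title': 'example'})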
def test_get_default_database_with_authsource(self):
    # Ensure we distinguish database name from authSource.
    uri = "mongodb://%s:%d/foo?authSource=src" % (host, port)
    c = MongoClient(uri, _connect=False)
    self.assertEqual(Database(c, 'foo'), c.get_default_database())
def test_get_default_database(self):
    c = MongoClient("mongodb://%s:%d/foo" % (host, port), _connect=False)
    self.assertEqual(Database(c, 'foo'), c.get_default_database())
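As the authSource tests above emphasise, the authSource query parameter only affects authentication; the default database is still taken from the URI path. A hedged sketch (placeholder host and port):

from pymongo import MongoClient

c = MongoClient("mongodb://localhost:27017/foo?authSource=src", connect=False)
assert c.get_default_database().name == "foo"  # not "src"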
class CompanyWebSiteSpider(scrapy.Spider): name = "website_crawler" EMAIL_RE = r'[\w\.-]+@[\w\.-]+' speciality = None mongo_client = None gatherer_database = None nlp = None dictionary = PyDictionary() stop_words = None words_to_find = None verbs_to_find = None def __init__(self, speciality=None, *args, **kwargs): super(CompanyWebSiteSpider, self).__init__(*args, **kwargs) self.logger.logger.setLevel(logging.INFO) if speciality is None: raise Exception("The speciality must be informed") self.speciality = speciality nltk.data.path.append( '/Users/xalperte/BigMLDev/company_web_scrapy/webscrawler/nltk_data' ) self.stop_words = stopwords.words('english') self.nlp = spacy.load('en') def init(self): self.mongo_client = MongoClient( self.settings.get('MONGO_GATHERER_BD_URI')) self.gatherer_database = self.mongo_client.get_default_database() self.prepare_words_to_find() self.prepare_verbs_to_find() def start_requests(self): self.init() companies_by_id = self.load_companies() company_num = 0 current_company = None try: # companies_by_id = self.load_jim_companies() for company_id, company in companies_by_id.iteritems(): current_company = company if 'webpage' in company: company_num += 1 self.logger.info( "[%d] Launch Home Page Request for %s - %s - %s - %s" % (company_num, company['webpage'], company['company_id'], company['company_name'], company['webpage'])) yield scrapy.Request(url=company['webpage'], meta={ 'url': company['webpage'], 'company_id': company['company_id'], 'company_name': company['company_name'], 'company_num': company_num, 'company_home_page': company['webpage'] }, errback=self.error, callback=self.parse_website) # company_num += 1 # if company_num == 10: # break else: self.logger.warning("The company [%s] " "doesn't have webpage infomed" % company_id) except Exception as e: self.logger.error( "Error processing company at [%d]. Company ID: [%s]. Cause [%s]" % (company_num, current_company['company_id'], repr(e))) def parse_website(self, response): try: self.logger.info( "[%d] Parsing Home Page from %s - %s - %s - %s" % (response.meta['company_num'], response.url, response.meta['company_id'], response.meta['company_name'], response.meta['company_home_page'])) self.update_company_page(response) home_url = urlparse.urlparse(response.url) home_netloc = home_url.netloc.lower() # Following only the links to the company website. Forget about the # links to other websites. processed_links = set() requested_links = 0 for link_data in self.get_links(self.guess_root(response.url), response.body): if link_data[1] not in processed_links: processed_links.add(link_data[1]) link_url = urlparse.urlparse(link_data[1]) link_netloc = link_url.netloc.lower() if home_netloc == link_netloc: if self.follow_link(link_data): requested_links += 1 # Only X links to follow if requested_links > 10: break yield scrapy.Request( url=link_data[1], meta={ 'url_name': link_data[0], 'url': link_data[1], 'company_id': response.meta['company_id'], 'company_name': response.meta['company_name'], 'company_home_page': response.meta['company_home_page'] }, errback=self.error, callback=self.parse_internal_page) self.logger.debug( "\t Link to follow: [%s] - [%s]" % (link_data[0], link_data[1])) else: self.logger.debug( "\t NOT Link to follow (not words in the link): [%s] - [%s]." % (link_data[0], link_data[1])) else: self.logger.debug( "\t NOT Link to follow (out of the company website): [%s] - [%s]. 
" "Home Netloc [%s] - Link Netloc [%s]" % (link_data[0], link_data[1], home_netloc, link_netloc)) except Exception as e: self.logger.error( "Home page parsing exception. URL [%s]. Cause [%s]" % (response.url, repr(e))) def parse_internal_page(self, response): try: self.logger.debug( "Parsing Internal Page from %s - %s - %s - %s - %s" % (response.meta['url_name'], response.url, response.meta['company_id'], response.meta['company_name'], response.meta['company_home_page'])) self.update_company_page(response, url_name=response.meta['url_name'], is_home=False) except Exception as e: self.logger.error( "Internal page parsing exception. URL [%s]. Cause [%s]" % (response.url), repr(e)) def follow_link(self, link_data): for word in self.words_to_find: if word in link_data[0] or \ word in link_data[2] or \ word in link_data[3]: return True return False def error(self, failure): # log all failures self.logger.error("Request Error!") self.logger.error(repr(failure)) # in case you want to do something special for some errors, # you may need the failure's type: if failure.check(HttpError): # these exceptions come from HttpError spider middleware # you can get the non-200 response response = failure.value.response self.logger.error('HttpError on %s', response.url) self.write_wrong_website('HTTP', response.meta['company_id'], response.meta['company_name'], response.url, repr(failure)) elif failure.check(DNSLookupError): # this is the original request request = failure.request self.logger.error('DNSLookupError on %s', request.url) self.write_wrong_website('DNSLookup', request.meta['company_id'], request.meta['company_name'], request.url, repr(failure)) elif failure.check(TimeoutError, TCPTimedOutError): request = failure.request self.logger.error('TimeoutError on %s', request.url) self.write_wrong_website('Timeout', request.meta['company_id'], request.meta['company_name'], request.url, repr(failure)) def get_links(self, root, html_doc): soup = BeautifulSoup(html_doc, 'html.parser') return self.resolve_links(root, soup.find_all('a', href=True)) def guess_root(self, base_url): if base_url.startswith('http'): parsed_link = urlparse.urlparse(base_url) scheme = parsed_link.scheme + '://' netloc = parsed_link.netloc return scheme + netloc def resolve_links(self, root, links): for link in links: link_title = link.get_text() link_href = link['href'] if not link_href.startswith('http'): link_href = urlparse.urljoin(root, link_href) # Bad urls (email attached to the url) match = re.findall(self.EMAIL_RE, link_href) if match and len(match) > 0: for email in match: link_href = link_href.replace(email, '') if link_href.endswith('/admin'): link_href = link_href.replace('/admin', '') yield (link_title, link_href, urlparse.urlparse(link_href).path, urlparse.urlparse(link_href).query) def prepare_verbs_to_find(self): base_verbs = [ 'give', 'offer', 'contribute', 'administer', 'bring', 'provide', 'supply', 'manufacture', 'produce', 'automate', 'commodity', 'sell', 'solve', 'build' ] self.verbs_to_find = set() for word in base_verbs: self.verbs_to_find.add(word) for idx, synset in enumerate(wordnet.synsets(word)): for synonym in synset.lemma_names(): self.verbs_to_find.add(synonym.replace('_', ' ')) # hypernyms = [l.lemma_names() for l in synset.hypernyms()] # for hypernym in hypernyms: # for word in hypernym: # self.verbs_to_find.add(word.replace('_', ' ')) # # hyponyms = [l.lemma_names() for l in synset.hyponyms()] # for hyponym in hyponyms: # for word in hyponym: # self.verbs_to_find.add(word.replace('_', ' ')) 
stop_verbs = set([ 'get', 'have', 'be', 'add', 'work', 'reach', 'open', 'create', 'take', 'break' ]) self.verbs_to_find = self.verbs_to_find.difference(stop_verbs) def prepare_words_to_find(self): all_speciality_words = set() for idx, synset in enumerate(wordnet.synsets(self.speciality)): for synonym in synset.lemma_names(): all_speciality_words.add(synonym.replace('_', ' ')) # hypernyms = [l.lemma_names() for l in synset.hypernyms()] # for hypernym in hypernyms: # for word in hypernym: # all_speciality_words.add(word.replace('_', ' ')) # # hyponyms = [l.lemma_names() for l in synset.hyponyms()] # for hyponym in hyponyms: # for word in hyponym: # all_speciality_words.add(word.replace('_', ' ')) # Words related to the maket market_words = [ 'mechanics', 'unfolding', 'marketplace', 'deploy', 'give', 'contribute', 'administer', 'bring', 'service', 'result', 'technology', 'market', 'use', 'compose', 'prepare', 'provide', 'make', 'support', 'business', 'supply', 'manufacture', 'product', 'robotics', 'ability', 'form', 'automate', 'produce', 'about', 'resource', 'commodity', 'vend', 'wholesale', 'work', 'solution', 'duty', 'retail', 'display', 'mission', 'vision' ] all_market_words = set() for word in market_words: all_market_words.add(word) for idx, synset in enumerate(wordnet.synsets(word)): for synonym in synset.lemma_names(): all_market_words.add(synonym.replace('_', ' ')) # hypernyms = [l.lemma_names() for l in synset.hypernyms()] # for hypernym in hypernyms: # for word in hypernym: # all_market_words.add(word.replace('_', ' ')) # # hyponyms = [l.lemma_names() for l in synset.hyponyms()] # for hyponym in hyponyms: # for word in hyponym: # all_market_words.add(word.replace('_', ' ')) communication_words = [ 'disclosure', 'communication', 'article', 'announcement', 'story', 'record', 'blog', 'intelligence', 'journal', 'advice', 'diary', 'news', 'forum' ] all_communication_words = set() for word in communication_words: all_communication_words.add(word) for idx, synset in enumerate(wordnet.synsets(word)): for synonym in synset.lemma_names(): all_communication_words.add(synonym.replace('_', ' ')) # hypernyms = [l.lemma_names() for l in synset.hypernyms()] # for hypernym in hypernyms: # for word in hypernym: # all_communication_words.add(word.replace('_', ' ')) # # hyponyms = [l.lemma_names() for l in synset.hyponyms()] # for hyponym in hyponyms: # for word in hyponym: # all_communication_words.add(word.replace('_', ' ')) special_words = set('rss') self.logger.debug("Speciality Words to find: [%s]" % ','.join(all_speciality_words)) self.logger.debug("Communication Words to find: [%s]" % ','.join(all_communication_words)) self.logger.debug("Market Words to find: [%s]" % ','.join(all_market_words)) self.logger.debug("Commons Words between sections: [%s]" % ','.join( set.intersection(all_speciality_words, all_market_words, all_communication_words))) self.words_to_find = set.union(all_speciality_words, all_market_words, all_communication_words, special_words) def write_wrong_website(self, type, company_id, company_name, company_url, error): with open('error-%s.txt' % type, 'ab') as f: f.write("\"%s\",\"%s\",\"%s\",\"%s\"\n" % (company_id, company_name, company_url, error)) def write_wrong_specialty(self, company_id, company_name, company_url): with open('wrong-specialty-%s.txt' % self.speciality, 'ab') as f: f.write("\"%s\",\"%s\",\"%s\"\n" % (company_id, company_name, company_url)) def update_company_page(self, response, url_name='Home', is_home=True): companies_pages = \ 
self.gatherer_database['company_webpages'] soup = BeautifulSoup(response.body) # Remove the script and style tags [x.extract() for x in soup.findAll('script')] [x.extract() for x in soup.findAll('style')] [x.extract() for x in soup.select('[style*="visibility:hidden"]')] [x.extract() for x in soup.select('[style*="display:none"]')] page_text = soup.get_text() # strip empty lines page_text = "".join( [s for s in page_text.strip().splitlines(True) if s.strip()]) if self.speciality not in page_text and is_home: self.write_wrong_specialty(response.meta['company_id'], response.meta['company_name'], response.url) keywords = self.get_page_meta('keywords', response.body) if keywords is not None: keywords = [keyword.strip() for keyword in keywords.split(',')] title = self.get_page_meta('title', response.body) title_bags_of_words = self.decompose_sentences( [title] if title is not None else None) description = self.get_page_meta('description', response.body) description_bags_of_words = self.decompose_sentences( [description] if description is not None else None) abstract = self.get_page_meta('abstract', response.body) abstract_bags_of_words = self.decompose_sentences( [abstract] if abstract is not None else None) sentences = self.find_sentences(page_text) sentences = self.decompose_sentences(sentences) companies_pages.update( { 'company_id': response.meta['company_id'], 'url': response.url }, { '$setOnInsert': { 'company_id': response.meta['company_id'], 'url': response.url, 'created': datetime.datetime.now() }, '$set': { 'updated': datetime.datetime.now(), 'url_name': url_name.strip(), 'company_name': response.meta['company_name'], 'specialty': self.speciality, 'title': title, 'description': description, 'abstract': abstract, 'keywords': keywords, 'is_home': is_home, 'content': soup.prettify(), 'content_plain_text': page_text, 'sentences': sentences, 'bags_of_words_in_meta': { 'title': title_bags_of_words, 'description': description_bags_of_words, 'abstract': abstract_bags_of_words, 'keywords': keywords } } }, upsert=True) def find_sentences(self, page_content): # http://stackoverflow.com/questions/36610179/how-to-get-the-dependency-tree-with-spacy/36612605 # page_content = page_content.lower() # lines = page_content.split('\n') # remove lines with less than 4 words # processed_text = "" # for line in page_content.split('\n'): # if line.split(' ') >= 4: # processed_text += line if isinstance(page_content, str): page_content = page_content.decode('utf-8') doc = self.nlp(page_content.replace('\n', '.\n')) sents = set() for sent in doc.sents: for token in sent: # Phasal verb? 
if token.dep_ == "prt" and token.head.pos_ == "VERB": verb = token.head.orth_ particle = token.orth_ phrasal_verb = ' '.join([verb, particle]) if phrasal_verb in self.verbs_to_find: sents.add(sent.string) elif token.pos == VERB and \ token.lemma_ in self.verbs_to_find: sents.add(sent.string) return list(sents) if len(list(sents)) > 0 else None # for token in sent: # if token.is_alpha: # # sentences_list = [] # for line in lines: # sentences_list.extend(word_tokenize(sentence) for sentence in sent_tokenize(line)) # # parser = nltk.ChartParser(gro) # for sentence in sentences_list: # # return sentences_list def decompose_sentences(self, sentences): """ For each sentence we are going to create different bags of words based on the kind of the meaning/content: entities: { persons: organizations: locations: products: events: work_of_art: languages: }, noun_chunks: { } nouns: { } verbs: { } Supported entities PERSON People, including fictional. NORP Nationalities or religious or political groups. FACILITY Buildings, airports, highways, bridges, etc. ORG Companies, agencies, institutions, etc. GPE Countries, cities, states. LOC Non-GPE locations, mountain ranges, bodies of water. PRODUCT Objects, vehicles, foods, etc. (Not services.) EVENT Named hurricanes, battles, wars, sports events, etc. WORK_OF_ART Titles of books, songs, etc. LANGUAGE Any named language. :param sentences: the list of sentences :return: a dictionary with the different types of bags of words """ sentences_list = [] if sentences is not None: for sentence in sentences: sentence_data = {'sentence': sentence, 'bags_of_words': None} bags_of_words = { 'all': [], 'entities': { 'PERSON': [], 'NORP': [], 'FACILITY': [], 'ORG': [], 'GPE': [], 'LOC': [], 'PRODUCT': [], 'EVENT': [], 'WORK_OF_ART': [], 'LANGUAGE': [] }, 'noun_chunks': [], 'VERB': [], 'NOUN': [] } doc = self.nlp(sentence if isinstance(sentence, unicode) else sentence.decode('utf-8')) # process the entities of the sentence for entity in doc.ents: if entity.label_ in [ 'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LANGUAGE' ]: entity_val = self.clean_stopwords( entity.string.lower()) if len(entity_val.strip()) > 0: synonyms = self.get_synonyms(sentence, entity_val) bags_of_words['entities'][entity.label_].append( (entity_val, synonyms)) bags_of_words['all'].append((entity_val, synonyms)) # process the noun chunks of the sentence for noun_chunk in doc.noun_chunks: # Lemmatizing the nouns (steamming) noun_chunk_val = self.clean_stopwords( noun_chunk.lemma_.lower()) if len(noun_chunk_val.strip()) > 0: synonyms = self.get_synonyms(sentence, noun_chunk_val) bags_of_words['noun_chunks'].append( (noun_chunk_val, synonyms)) bags_of_words['all'].append((noun_chunk_val, synonyms)) # process verbs and nouns of the sentence for word in doc: if word.pos_ in ['VERB', 'NOUN']: # Lemmatizing the words (stemming) word_val = self.clean_stopwords(word.lemma_.lower()) if len(word_val.strip()) > 0: synonyms = self.get_synonyms(sentence, word_val) bags_of_words[word.pos_].append( (word_val, synonyms)) bags_of_words['all'].append((word_val, synonyms)) sentence_data['bags_of_words'] = bags_of_words sentences_list.append(sentence_data) return sentences_list def clean_stopwords(self, text): return ' '.join( [w for w in text.split(' ') if w.lower() not in self.stop_words]) def get_synonyms(self, sentence, word): from pywsd.lesk import simple_lesk synonyms = set() if isinstance(sentence, str): sentence = sentence.decode('utf-8') if isinstance(word, str): word = 
word.decode('utf-8') synset = simple_lesk(sentence, word) if synset is not None: for synonym in synset.lemma_names(): synonyms.add(synonym.replace('_', ' ')) # for idx, synset in enumerate(wordnet.synsets(word)): # for synonym in synset.lemma_names(): # synonyms.add(synonym.replace('_', ' ')) return list(synonyms) def get_page_meta(self, meta_name, page_html): soup = BeautifulSoup(page_html) value = "" for meta in soup.findAll("meta"): metaname = meta.get('name', '').lower() metaprop = meta.get('property', '').lower() if meta_name == metaname or metaprop.find(meta_name) > 0: if 'content' in meta.__dict__['attrs']: try: value = ' '.join( [value, meta['content'].strip().encode('utf-8')]) except: self.logger.error( "Error looking for [%s] in the metadata. Meta: [%s]" % (meta_name, meta)) raise Exception( "Error looking for [%s] in the metadata. Meta: [%s]" % (meta_name, meta)) return value.strip() if value != "" else None def load_jim_companies(self): companies_collection = \ self.gatherer_database['consolidated_company'] companies_by_id = {} with open('wp.txt', 'rb') as file: for i, line in enumerate(file): values = line.split(',') company_id = values[0].replace('"', '') webpage = values[1].replace('"', '') if len(webpage.strip()) > 0: if webpage.startswith('http//'): webpage = webpage.replace('http//', 'http://') if webpage.startswith('https//'): webpage = webpage.replace('https//', 'https://') if not webpage.startswith( 'http://') and not webpage.startswith('https://'): webpage = "http://%s" % webpage companies_by_id[company_id] = { 'company_id': company_id, 'webpage': webpage, 'top_speciality': self.speciality, } # Adding the website information from the consolidated_company collection companies_domain = companies_collection.find( { "company_id": { '$in': [ company['company_id'] for (company_id, company) in companies_by_id.iteritems() ] }, "webpage": { '$exists': True }, }, { "company_id": 1, "company_name": 1 }) for company in companies_domain: companies_by_id[company['company_id']]['company_name'] = company[ 'company_name'] self.logger.info("Companies with website informed: [%d]" % len(companies_by_id)) return companies_by_id def load_companies(self): companies_collection = \ self.gatherer_database['consolidated_company'] companies_top_speciality_collection = \ self.gatherer_database['company_specialities'] # Looking for all the companies in Machine Learning companies = companies_top_speciality_collection.find( {"tf_idf.top_tag": self.speciality}, { "company_id": 1, "tf_idf.top_tag": 1 }) companies_by_id = {} for company in companies: companies_by_id[company['company_id']] = { 'company_id': company['company_id'], 'top_speciality': company['tf_idf']['top_tag'] } # Adding the website information from the consolidated_company collection companies_domain = companies_collection.find( { "company_id": { '$in': [ company['company_id'] for (company_id, company) in companies_by_id.iteritems() ] }, "webpage": { '$exists': True }, }, { "company_id": 1, "company_name": 1, "webpage": 1 }) for company in companies_domain: companies_by_id[company['company_id']]['company_name'] = company[ 'company_name'] webpage = company['webpage'] if webpage.startswith('http//'): webpage = webpage.replace('http//', 'http://') if webpage.startswith('https//'): webpage = webpage.replace('https//', 'https://') if not webpage.startswith('http://') and not webpage.startswith( 'https://'): webpage = "http://%s" % webpage companies_by_id[company['company_id']]['webpage'] = webpage self.logger.info("Companies with website 
informed: [%d]" % len(companies_by_id)) return companies_by_id
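update_company_page() above persists one document per crawled page with a legacy Collection.update() upsert keyed on (company_id, url). A hedged sketch of the same pattern using the non-deprecated update_one() API; the URI and field values are illustrative.

import datetime

from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017/gatherer").get_default_database()
pages = db['company_webpages']

pages.update_one(
    {'company_id': 'c-123', 'url': 'http://example.com/'},
    {
        '$setOnInsert': {'company_id': 'c-123',
                         'url': 'http://example.com/',
                         'created': datetime.datetime.now()},
        '$set': {'updated': datetime.datetime.now(),
                 'url_name': 'Home',
                 'is_home': True,
                 'content_plain_text': '...'},
    },
    upsert=True)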
def compute_aggregations(source_db_uri, source_collection, changed_intervals,
                         process_field, interval, operators_list,
                         field_names_list, resource_type):
    """ Builds the aggregation pipeline, executes it (one aggregated result
    per changed interval) and returns the changed_intervals list upgraded
    with the result document.

    Ex.
        [{"$match": {
            "metric_name": "twitter_followers",
            "company_id": "2341231231",
            "date": {"$gte": lower_date, "$lt": upper_date}}},
         {"$project": {"date": "$date", "value": "$value"}},
         {"$sort": {"date": 1}},
         {"$group": {"_id": {"company_id": "$_id"},
                     "aggr_field_name_1": {"$last": "$value"},
                     "aggr_field_name_2": {"$sum": "$value"},
                     "aggr_field_name_3": {"$avg": "$value"}}},
         {"$project": {"_id": 0,
                       "company_id": "$_id.company_id",
                       "aggr_field_name_1": "$aggr_field_name_1",
                       "aggr_field_name_2": "$aggr_field_name_2",
                       "aggr_field_name_3": "$aggr_field_name_3"}}]

    Returns a result document attached to each changed_intervals document:

        [{"company_id": "2341231231", "year": 2013, "interval": 7,
          "result": {"twitter_followers_last": 456,
                     "twitter_followers_first": 123,
                     "twitter_followers_count": 34, ...}},
         {"company_id": "2341231231", "year": 2013, "interval": 8,
          "result": {"twitter_followers_last": 135}},
         {"company_id": "2341231444", "year": 2015, "interval": 9,
          "result": {"twitter_followers_last": 1023}},
         {"company_id": "2341231444", "year": 2015, "interval": 11,
          "result": {"twitter_followers_last": 1050}},
         ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param process_field: field to process: "twitter_followers",
        "twitter_bio", ...
    :param changed_intervals: List of documents with the changed intervals
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param operators_list: aggregator operator list, i.e. "last sum avg"
    :param field_names_list: aggregator field name list, i.e.
        "twitter_followers_last twitter_followers_sum twitter_followers_avg"
    :param resource_type: Resource Type (company, person, ...)
    """
    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    print "Processing %d changed intervals...." % len(changed_intervals)

    NUM_OF_INTERVALS_TO_INFORM = 1000

    for idx, changed_interval in enumerate(changed_intervals):
        lower_date, upper_date = \
            interval_date_range(changed_interval, interval)

        # Base pipeline aggregation list
        pipeline_aggregated = [{
            "$match": {
                "%s_id" % resource_type:
                    changed_interval['%s_id' % resource_type]
            }
        }, {
            "$unwind": "$%s_ts" % process_field
        }, {
            "$project": {
                "%s_id" % resource_type: "$%s_id" % resource_type,
                "value": "$%s_ts.value" % process_field,
                "date": "$%s_ts.date" % process_field,
                "updated": "$%s_ts.updated" % process_field,
                "year": {"$year": "$%s_ts.date" % process_field},
                "interval": set_field_interval("%s_ts.date" % process_field,
                                               interval)
            }
        }, {
            "$match": {
                "date": {"$gte": lower_date, "$lt": upper_date}
            }
        }, {
            "$project": {
                "date": "$date",
                "value": "$value",
                "%s_id" % resource_type: "$%s_id" % resource_type
            }
        }, {
            "$sort": {"date": 1}
        }, {
            "$group": {
                "_id": {"%s_id" % resource_type: "%s_id" % resource_type}
            }
        }, {
            "$project": {
                "_id": 0,
                "%s_id" % resource_type: "$_id.%s_id" % resource_type
            }
        }]

        # Upgrade pipeline aggregation list with aggregation operators
        for i in range(len(operators_list)):
            pipeline_aggregated[6]['$group'][field_names_list[i]] = \
                {"$" + operators_list[i]: "$value"} \
                if (operators_list[i] != 'count') else {"$sum": 1}
            pipeline_aggregated[7]['$project'][field_names_list[i]] = \
                "$" + field_names_list[i]

        # Upgrade changed_interval document with result
        for aggregated_value in collection.aggregate(pipeline_aggregated,
                                                     allowDiskUse=True):
            changed_interval['result'] = {}
            for i in range(len(operators_list)):
                changed_interval['result'][field_names_list[i]] = \
                    aggregated_value[field_names_list[i]]

        if idx != 0 and idx % NUM_OF_INTERVALS_TO_INFORM == 0:
            print "%d intervals processed" % \
                  ((idx / NUM_OF_INTERVALS_TO_INFORM) *
                   NUM_OF_INTERVALS_TO_INFORM)

    client.close()

    return changed_intervals
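The loop over operators_list/field_names_list above extends stages 6 ($group) and 7 ($project) of the base pipeline. A small hedged illustration of what those stages look like after the loop, with example operator and field-name inputs:

operators_list = ['last', 'avg', 'count']
field_names_list = ['twitter_followers_last', 'twitter_followers_avg',
                    'twitter_followers_count']

group_stage = {"$group": {"_id": {"company_id": "$company_id"}}}
project_stage = {"$project": {"_id": 0, "company_id": "$_id.company_id"}}

for op, name in zip(operators_list, field_names_list):
    # 'count' has no direct accumulator, so it is emulated with {"$sum": 1}.
    group_stage["$group"][name] = (
        {"$" + op: "$value"} if op != 'count' else {"$sum": 1})
    project_stage["$project"][name] = "$" + name

print(group_stage)
print(project_stage)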
def compute_ts_aggregations_1toN(source_db_uri, source_collection,
                                 changed_intervals, process_field, date_field,
                                 interval, operators_list, field_names_list,
                                 resource_type, withinTheInterval=True):
    """ Builds the aggregation pipeline, executes it (one aggregated result
    per changed interval) and returns the changed_intervals list upgraded
    with the result document.

    Ex.
        [{"$match": {
            "company_id": "2341231231",
            "date": {"$gte": lower_date, "$lt": upper_date}}},
         {"$project": {"date": "$date", "value": "$value"}},
         {"$sort": {"date": 1}},
         {"$group": {"_id": {"company_id": "$_id"},
                     "aggr_field_name_1": {"$last": "$value"},
                     "aggr_field_name_2": {"$sum": "$value"},
                     "aggr_field_name_3": {"$avg": "$value"}}},
         {"$project": {"_id": 0,
                       "company_id": "$_id.company_id",
                       "aggr_field_name_1": "$aggr_field_name_1",
                       "aggr_field_name_2": "$aggr_field_name_2",
                       "aggr_field_name_3": "$aggr_field_name_3"}}]

    Returns a result document attached to each changed_intervals document:

        [{"company_id": "2341231231", "year": 2013, "interval": 7,
          "result": {"twitter_followers_last": 456,
                     "twitter_followers_first": 123,
                     "twitter_followers_count": 34, ...}},
         {"company_id": "2341231231", "year": 2013, "interval": 8,
          "result": {"twitter_followers_last": 135}},
         {"company_id": "2341231444", "year": 2015, "interval": 9,
          "result": {"twitter_followers_last": 1023}},
         {"company_id": "2341231444", "year": 2015, "interval": 11,
          "result": {"twitter_followers_last": 1050}},
         ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param process_field: field to process: "twitter_followers",
        "twitter_bio", ...
    :param date_field: name of the document field holding the date used to
        bound the interval
    :param changed_intervals: List of documents with the changed intervals
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param operators_list: aggregator operator list, i.e. "last sum avg"
    :param field_names_list: aggregator field name list, i.e.
        "twitter_followers_last twitter_followers_sum twitter_followers_avg"
    :param resource_type: Resource Type (company, person, ...)
    :param withinTheInterval: when True, also apply the interval lower bound
        ($gte lower_date); when False, aggregate everything up to the
        interval upper bound
    """
    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    company_processed = 0
    for changed_interval in changed_intervals:
        lower_date, upper_date = \
            interval_date_range(changed_interval, interval)

        # Base pipeline aggregation list
        pipeline_aggregated = [{
            "$match": {
                "%s_id" % resource_type:
                    changed_interval['%s_id' % resource_type],
                date_field: {"$lt": upper_date}
            }
        }, {
            "$project": {
                "id": "$%s_id" % resource_type,
                "date": "$%s" % date_field,
                "value": "$%s" % process_field
            }
        }, {
            "$sort": {"date": 1}
        }, {
            "$group": {"_id": {"id": "$id"}}
        }, {
            "$project": {"_id": 0, 'id': '$_id.id'}
        }]

        if withinTheInterval:
            pipeline_aggregated[0]['$match'][date_field]['$gte'] = lower_date

        # Upgrade pipeline aggregation list with aggregation operators
        for i in range(len(operators_list)):
            pipeline_aggregated[3]['$group'][field_names_list[i]] = \
                {"$" + operators_list[i]: "$value"} \
                if (operators_list[i] != 'count') else {"$sum": 1}
            pipeline_aggregated[4]['$project'][field_names_list[i]] = \
                "$" + field_names_list[i]

        # Upgrade changed_interval document with result
        for aggregated_value in collection.aggregate(pipeline_aggregated,
                                                     allowDiskUse=True):
            changed_interval['result'] = {}
            for i in range(len(operators_list)):
                changed_interval['result'][field_names_list[i]] = \
                    aggregated_value[field_names_list[i]]

        company_processed += 1
        if company_processed % 100 == 0:
            print "Processed %d/%d companies" % (company_processed,
                                                 len(changed_intervals))

    client.close()

    return changed_intervals
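A hypothetical invocation of compute_ts_aggregations_1toN(); the URI, collection and field names are made-up placeholders. With withinTheInterval=False only the upper date bound is applied, so each interval receives a cumulative aggregate of everything up to its end.

results = compute_ts_aggregations_1toN(
    source_db_uri="mongodb://localhost:27017/preseries",
    source_collection="company_metrics",
    changed_intervals=[{"company_id": "2341231231",
                        "year": 2015, "interval": 9}],
    process_field="twitter_followers",
    date_field="date",
    interval="month",
    operators_list=["last", "count"],
    field_names_list=["twitter_followers_last", "twitter_followers_count"],
    resource_type="company",
    withinTheInterval=False)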
def get_changed_intervals(source_db_uri, source_collection, last_execution,
                          curr_date, process_field, interval, resource_type):
    """ Builds the aggregation pipeline, executes it and returns the
    intervals (periods) with changes.

    Ex.
        [{"$match": {"metric_name": {"$in": ["twitter_followers",
                                             "twitter_following"]},
                     "updated": {"$gte": last_execution, "$lt": curr_date}}},
         {"$project": {"company_id": "$_id",
                       "date": "$date",
                       "year": {"$year": "$date"},
                       "interval": {"$month": "$date"}}},
         {"$group": {"_id": {"company_id": "$company_id",
                             "year": "$year",
                             "interval": "$interval"}}},
         {"$project": {"_id": 0,
                       "company_id": "$_id.company_id",
                       "year": "$_id.year",
                       "interval": "$_id.interval"}}]

    Returns a list of changed intervals with this structure:

        [{"company_id": ObjectId("2341231231"), "year": 2013, "interval": 7},
         {"company_id": ObjectId("2341231231"), "year": 2013, "interval": 8},
         {"company_id": ObjectId("2341231444"), "year": 2015, "interval": 9},
         {"company_id": ObjectId("2341231444"), "year": 2015, "interval": 11},
         ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param last_execution: Lower date
    :param curr_date: Upper date
    :param process_field: metric to process: "twitter_followers"
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param resource_type: Resource Type (company, person, ...)
    """
    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    pipeline_periods = [{
        "$unwind": "$%s_ts" % process_field
    }, {
        "$project": {
            "%s_id" % resource_type: "$%s_id" % resource_type,
            "value": "$%s_ts.value" % process_field,
            "date": "$%s_ts.date" % process_field,
            "updated": "$%s_ts.updated" % process_field,
            "year": {"$year": "$%s_ts.date" % process_field},
            "interval": set_field_interval("%s_ts.date" % process_field,
                                           interval)
        }
    }, {
        "$match": {"updated": {"$lt": curr_date}}
    }, {
        "$group": {
            "_id": {
                "%s_id" % resource_type: "$%s_id" % resource_type,
                "year": "$year",
                "interval": "$interval"
            }
        },
    }, {
        "$project": {
            "_id": 0,
            "%s_id" % resource_type: "$_id.%s_id" % resource_type,
            "year": "$_id.year",
            "interval": "$_id.interval"
        }
    }]

    # If last_execution is informed, add the $gte clause to the "$match" elem.
    if last_execution is not None:
        pipeline_periods[2]['$match']["updated"]['$gte'] = last_execution

    changed_intervals = []
    for changed_interval in collection.aggregate(pipeline_periods,
                                                 allowDiskUse=True):
        changed_intervals.append(changed_interval)

    client.close()

    return changed_intervals
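set_field_interval() is not shown in this section. A speculative sketch of what such a helper could look like, assuming it maps the interval name onto the corresponding MongoDB date operator; this is an assumption, not the original implementation.

def set_field_interval(date_field, interval):
    # Assumed behaviour: interval is one of year, quarter, month,
    # dayOfYear, week (see the docstrings above).
    if interval == 'quarter':
        # MongoDB has no $quarter operator; derive it from the month.
        return {"$ceil": {"$divide": [{"$month": "$%s" % date_field}, 3]}}
    return {"$%s" % interval: "$%s" % date_field}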
def update_preseries(target_db_uri, target_collection, source_db_uri,
                     source_collection, interval, is_prediction,
                     computed_aggregations, resource_type):
    """ Update aggregated collection in the target system

    :param target_db_uri: Target DB URI, i.e. mongodb://localhost/databasename
    :param target_collection: Output collection name
    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param is_prediction: Boolean with True or False
    :param computed_aggregations: Document with results
    """
    tgt_client = MongoClient(target_db_uri)
    tgt_db = tgt_client.get_default_database()
    tgt_col = tgt_db[target_collection]

    src_client = MongoClient(source_db_uri)
    src_db = src_client.get_default_database()
    src_col = src_db[source_collection]

    project_clause = {
        "_id": 0,
        "company_name": "$company_name",
        "company_foundation_date": "$foundation_date"
    }

    if resource_type == 'person':
        project_clause = {
            "_id": 0,
            "first_name": "$first_name",
            "last_name": "$last_name",
            "gender": "$gender"
        }

    if resource_type == 'investor':
        project_clause = {
            "_id": 0,
            "investor_name": "$investor_name",
            "investor_foundation_date": "$foundation_date"
        }

    bulk_counter = 0
    full_counter = 0
    block_size = 10000
    aggregation_num = len(computed_aggregations)
    tries_num = 3

    bulk = tgt_col.initialize_ordered_bulk_op()
    while full_counter < aggregation_num:
        comp_aggregation = computed_aggregations[full_counter]

        fields_to_insert, fields_to_update = \
            prepare_fields(src_col, project_clause, resource_type,
                           comp_aggregation, interval, is_prediction)

        find_pipeline = get_find_pipeline(resource_type, comp_aggregation,
                                          interval, fields_to_insert,
                                          is_prediction)

        bulk.find(find_pipeline).upsert().update({
            "$setOnInsert": fields_to_insert,
            "$set": fields_to_update
        })
        bulk_counter += 1
        full_counter += 1

        # Manage a page of block_size records
        if bulk_counter == block_size:
            try:
                bulk.execute()
                tries_num = 3
                bulk = tgt_col.initialize_ordered_bulk_op()
                bulk_counter = 0
                print "%d records processed" % full_counter
            except BulkWriteError as ex:
                # give a second chance to the execute
                if tries_num == 0:
                    print "bulk.execute() failed 3 times..."
                    print "ERROR processing Task. Exception: [%s]" % ex
                    traceback.print_exc()
                    raise ex
                sleep(0.5)
                bulk = tgt_col.initialize_ordered_bulk_op()
                bulk_counter = 0
                full_counter -= block_size
                tries_num -= 1
            except Exception as ex2:
                print "ERROR processing Task. Exception: [%s]" % ex2
                traceback.print_exc()
                raise ex2

    # Manage rest of records from the latest complete page to the end
    if bulk_counter > 0:
        try:
            bulk.execute()
            print "%d records processed. Finished" % full_counter
        except BulkWriteError as ex:
            # give a second chance to the execute
            sleep(1)
            bulk = tgt_col.initialize_ordered_bulk_op()
            full_counter = aggregation_num - bulk_counter
            for comp_aggr_inx in range(full_counter, aggregation_num):
                comp_aggregation = computed_aggregations[comp_aggr_inx]
                if len(comp_aggregation['result']) == 0:
                    continue
                fields_to_insert, fields_to_update = \
                    prepare_fields(src_col, project_clause, resource_type,
                                   comp_aggregation, interval, is_prediction)
                find_pipeline = get_find_pipeline(resource_type,
                                                  comp_aggregation, interval,
                                                  fields_to_insert,
                                                  is_prediction)
                bulk.find(find_pipeline).upsert().update({
                    "$setOnInsert": fields_to_insert,
                    "$set": fields_to_update
                })
                full_counter += 1
            bulk.execute()
            print "%d records processed. Finished" % full_counter
        except Exception as ex:
            print "ERROR processing Task. Exception: [%s]" % ex
            traceback.print_exc()
            raise ex

    tgt_client.close()
    src_client.close()
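initialize_ordered_bulk_op() is the legacy bulk API (deprecated in PyMongo 3.5 and removed in PyMongo 4). A hedged sketch of the same batched upsert written with bulk_write()/UpdateOne; the URI, collection name and documents stand in for the values produced by get_find_pipeline() and prepare_fields().

from pymongo import MongoClient, UpdateOne
from pymongo.errors import BulkWriteError

tgt_col = (MongoClient("mongodb://localhost:27017/preseries")
           .get_default_database()["company_aggregations"])

# One UpdateOne per computed aggregation (placeholder documents here).
requests = [
    UpdateOne({"company_id": "c-1", "year": 2015, "interval": 9},
              {"$setOnInsert": {"company_name": "ACME Corp."},
               "$set": {"twitter_followers_last": 456}},
              upsert=True),
]

try:
    tgt_col.bulk_write(requests, ordered=True)
except BulkWriteError as ex:
    print(ex.details)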