class DiplomatScrapper(StaticScrapper):
    def __init__(self, url, keywords, requested_by=None):
        self.requested_by = requested_by
        url_args = {
            'key': "AIzaSyCVAXiUzRYsML1Pv6RwSG1gunmMikTzQqY",
            'rsz': "filtered_cse",
            'num': 10,
            'hl': "en",
            'prettyPrint': "false",
            'source': "gcsc",
            'gss': ".com",
            'sig': "d5630e36052d1355ead71530c29be9ea",
            'cx': "006972344228181832854:w07k6emi2wk",
            'cse_tok': "ABPF6HibCVLLP6-x8toeGUn5PJY3CrbCXw:1526812940946",
            "q": keywords
        }
        super().__init__(url, keywords, url_args,
                         callback=self.parse_search_result,
                         requested_by=requested_by)
        self.lang = "en"
        self.dbf = DBFace()

    def parse_search_result(self, url, page_content, keywords):
        print("The Diplomat: got {} chars".format(len(page_content)))
        result = json.loads(page_content)
        if 'error' in result.keys():
            print("TheDiplomatScrapper: " + result['error']['errors'][0]['message'])
            print("obsolete cse_tok parameter ?")
        else:
            for i in result['results']:
                lnk = i['clicktrackUrl']
                query_part = urlparse(lnk).query
                query_comps = parse_qs(query_part)
                lnk = query_comps['q'][0]
                sc = StaticScrapper(lnk, keywords=keywords,
                                    callback=self.parse_page_content,
                                    requested_by=self.requested_by)
                sc.start()

    def parse_page_content(self, url, page_content, keywords):
        out_text = []
        soup = BeautifulSoup(page_content, "lxml")
        content_p = soup.find_all('div', {'itemprop': ['articleBody']})
        # print("Diplomat: found {} article elem".format(len(content_p)))
        for maincnt in content_p:
            for parag in maincnt.find_all('p'):
                pt = parag.get_text()
                out_text.append(pt)
        # print("read {} chars on {}".format(len(''.join(out_text)), url))
        self.dbf.add_record(keywords, url, ''.join(out_text), lang=self.lang)
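# Usage sketch (not from the original sources): driving the v1-style
# DiplomatScrapper directly, with a plain list standing in for
# Congruence.thread_accumulator so every spawned scrapper thread can be
# joined. The search URL and keyword are the ones used by Congruence.
threads = []
dps = DiplomatScrapper('https://www.googleapis.com/customsearch/v1element?',
                       "kim jong",
                       requested_by=threads.append)
dps.start()
for t in threads:
    t.join()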
class GraphBolt(Bolt):
    outputs = ['info', 'graph_json']

    def initialize(self, conf, ctx):
        self.pid = os.getpid()
        self.db = DBFace()

    def process(self, tup):
        info = tup.values[0]
        wordcounts = tup.values[1]
        graph = GlobalGraph(wordcounts, logger=self.logger)
        graph_json = graph.to_json()
        self.db.insert_graph(graph_json)
        self.logger.info("graph inserted in db")
        self.emit([info, graph_json])
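# Minimal sketch of the same work done outside the Storm topology (an
# illustration only, assuming GlobalGraph and DBFace are importable here):
# GraphBolt.process() receives (info, wordcounts), builds a GlobalGraph,
# persists its JSON through DBFace.insert_graph(), then re-emits both fields.
def build_and_store_graph(wordcounts, logger=None):
    db = DBFace()
    graph_json = GlobalGraph(wordcounts, logger=logger).to_json()
    db.insert_graph(graph_json)
    return graph_json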
def __init__(self, url, keywords=None, url_args=None, callback=None, requested_by=None):
    Thread.__init__(self)
    # assert callable(callback) is True or callback is None
    self.url = url
    self.request_url = ''
    self.callback = callback
    self.url_args = url_args
    self.keywords = keywords
    self.dbf = DBFace()
    self.requested_by = requested_by
    if requested_by is not None and callable(requested_by):
        requested_by(self)
def __init__(self, url, keywords=None, url_args=None, callback=None, js=True, requested_by=None):
    Thread.__init__(self)
    self.requested_by = requested_by
    assert callable(callback) is True or callback is None
    self.url = url
    self.request_url = ''
    self.callback = callback
    self.url_args = url_args
    self.keywords = keywords
    self.js = js  # if true, use the selenium framework to get content generated from js
    self.dbf = DBFace()
    if requested_by is not None and callable(requested_by):
        requested_by(self)
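# The requested_by hook used by the constructors above is any callable that
# receives the newly created thread; Congruence.thread_accumulator uses it to
# collect every scrapper (including the ones spawned while parsing search
# results) so they can all be joined. A stand-alone equivalent, provided here
# only as an illustration:
class ThreadAccumulator:
    def __init__(self):
        self.threads = []

    def __call__(self, thread):
        # invoked from the scrapper __init__ as requested_by(self)
        self.threads.append(thread)

    def join_all(self):
        for t in self.threads:
            t.join()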
class FigaroStaticScrapper(StaticScrapper):
    def __init__(self, url, keywords, requested_by):
        self.requested_by = requested_by
        url = url + quote(keywords)
        super().__init__(url, keywords, '',
                         callback=self.parse_search_result,
                         requested_by=requested_by)
        self.lang = "fr"
        self.dbf = DBFace()

    def parse_search_result(self, url, page_content, keywords):
        # print("figaro received {}".format(len(page_content)))
        soup = BeautifulSoup(page_content, "lxml")
        resdivs = soup.find_all('section', {'class': ['fig-profil',
                                                      'fig-profil-mtpd',
                                                      'fig-profil-std',
                                                      'univers-figaro-vox']})
        print("found {} results on figaro".format(len(resdivs)))
        for i in resdivs:
            lnk = i.find_all('a')[0].get('href')
            sc = StaticScrapper(lnk, keywords=keywords,
                                callback=self.parse_page_content,
                                requested_by=self.requested_by)
            sc.start()

    def parse_page_content(self, url, page_content, keywords):
        out_text = []
        soup = BeautifulSoup(page_content, "lxml")
        content_p = soup.find_all('div', {'class': 'fig-content__body'})
        if len(content_p) == 0:  # sport pages
            content_p = soup.find_all('div', {'class': 's24-art-body'})
        if len(content_p) == 0:  # 'le particulier' pages
            content_p = soup.find_all('div', {'class': ['wysiwyg', 'classic']})
        if len(content_p) == 0:  # wine pages
            content_p = soup.find_all('div', {'id': 'content-text'})
        if len(content_p) == 0:  # 'figaro madame' pages
            content_p = soup.find_all('div', {'class': ['article-body',
                                                        'mad__article__content__body',
                                                        'selectionShareable']})
        if len(content_p) == 0:  # 'economie' pages
            content_p = soup.find_all('div', {'class': 'texte'})  # does not work because len > 0
        if len(content_p) == 0:  # l'etudiant pages
            content_p = soup.find_all('div', {'class': 'article__content'})
        for maincnt in content_p:
            for parag in maincnt.find_all('p'):
                # print(parag.get_text())
                out_text.append(parag.get_text())
        # print("read {} chars on {}".format(len(''.join(out_text)), url))
        self.dbf.add_record(keywords, url, ''.join(out_text), lang=self.lang)
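# The cascade of "if len(content_p) == 0" fallbacks in parse_page_content()
# could equally be driven by a list of candidate selectors; a sketch of that
# alternative (the selector list is copied from the branches above, the helper
# name is illustrative):
FIGARO_BODY_SELECTORS = [
    {'class': 'fig-content__body'},
    {'class': 's24-art-body'},
    {'class': ['wysiwyg', 'classic']},
    {'id': 'content-text'},
    {'class': ['article-body', 'mad__article__content__body', 'selectionShareable']},
    {'class': 'texte'},
    {'class': 'article__content'},
]

def find_article_body(soup):
    # return the first non-empty match, or an empty list
    for attrs in FIGARO_BODY_SELECTORS:
        content_p = soup.find_all('div', attrs)
        if content_p:
            return content_p
    return []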
import sys

from utils.DBFace import DBFace
import utils.Wordcount_methods as wcm
import utils.Graph as graph

if __name__ == '__main__':
    print("running on Python version {}".format(sys.version))
    keywords = "kim jong"
    # if an argument is provided on the command line, use it as the keywords
    if len(sys.argv) > 1:
        keywords = ' '.join(sys.argv[1:])

    congruence = Congruence()
    congruence.recursive_search(keywords, keywords, conf.RECURSIVE_DEPTH, langs=['en'])

    dbf = DBFace()
    wordcounts = dbf.get_wordcounts(keywords)
    global_wordcount = wcm.aggregate_wordcount_dicts(wordcounts)
    # print(global_wordcount)
    g = graph.GlobalGraph(wordcounts, n=6)
    # print(g.to_json())
    g.to_dot()
class StaticScrapper(Thread):
    def __init__(self, keywords, url=None, requested_by=None, run=True):
        self.keywords = keywords
        self.url = url
        self.requested_by = requested_by
        if run:
            Thread.__init__(self)
        self.dbf = DBFace()
        if requested_by is not None and callable(requested_by):
            requested_by(self)

    def search(self):
        # url_args = self.search_args
        search_params = self.make_search_params(self.keywords)
        page_content = self.fetch_url(search_params[0], search_params[1])
        links = self.parse_search_page(page_content)
        print("found links : {}".format(links))
        for lnk in links:
            sc = self.__class__(keywords=self.keywords, url=lnk,
                                requested_by=self.requested_by)
            sc.start()

    def content_to_db(self):
        page_content = self.fetch_url(self.url)
        paragraphs = self.parse_page_content(page_content)
        # print("read {} chars on {}".format(len(''.join(out_text)), url))
        self.dbf.add_record(self.keywords, self.url, paragraphs, lang=self.lang)

    @classmethod
    def get_search_results(cls, keywords):
        search_params = cls.make_search_params(keywords)
        search_page = cls.fetch_url(search_params[0], search_params[1])
        search_results = cls.parse_search_page(search_page)
        return search_results

    @classmethod
    def get_scrap_results(cls, url):
        article_raw = cls.fetch_url(url)
        article_content = cls.parse_page_content(article_raw)
        return article_content

    @staticmethod
    def fetch_url(url, url_args=None):
        try:
            encoded_args = urlencode(url_args)
            request_url = url + encoded_args
        except TypeError as te:
            request_url = url
        try:
            print("requesting url : ", request_url)
            r = requests.get(request_url)
            return r.text
        except requests.exceptions.ConnectionError as ce:
            print(ce)
        except urllib3.exceptions.MaxRetryError as mre:
            print(mre)

    def run(self):
        if self.url is None:
            self.search()
        else:
            self.content_to_db()
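# Skeleton of a concrete v2 scrapper (illustrative only, not one of the
# project's classes): the v2 StaticScrapper above expects its subclasses to
# provide make_search_params(), parse_search_page(), parse_page_content() and
# a lang attribute. The URL and CSS class below are placeholders.
class ExampleScrapper(StaticScrapper):
    lang = "en"

    @staticmethod
    def make_search_params(keywords):
        # (search_url, url_args) pair consumed by fetch_url()
        return "https://example.com/search?", {'q': keywords}

    @staticmethod
    def parse_search_page(page_content):
        # return the article links found on the search results page
        soup = BeautifulSoup(page_content, "lxml")
        return [a.get('href') for a in soup.find_all('a', {'class': 'result'})]

    @staticmethod
    def parse_page_content(page_content):
        # return the article text that content_to_db() stores via DBFace
        soup = BeautifulSoup(page_content, "lxml")
        return ''.join(p.get_text() for p in soup.find_all('p'))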
class Congruence:
    def __init__(self):
        self.threads = []
        self.keywords = ''
        self.dbf = DBFace()

    def run(self, keywords):
        print("running Congruence with keyword {}".format(keywords))
        self.keywords = keywords
        self.recursive_search(self.keywords, self.keywords, 1, langs=['en'])
        wordcounts = self.dbf.get_wordcounts(self.keywords)
        self.g = graph.GlobalGraph(wordcounts, n=6)
        lgg = self.g.to_json()
        self.dbf.insert_graph(lgg)
        g = self.dbf.get_graph()
        # jstr = json.dumps(g)
        # return str(g)
        return g

    def get_db(self):
        return self.dbf

    def thread_accumulator(self, thread):
        # print("started thread {}".format(thread))
        self.threads.append(thread)

    def run_scrappers(self, keywords, langs):
        if 'fr' in langs:
            if conf.SCRAPPERS_VERSION == 1:
                ns = NouvelobsStaticScrapper(
                    "https://recherche.nouvelobs.com/?", keywords,
                    self.thread_accumulator)
                ls = LiberationStaticScrapper(
                    "http://www.liberation.fr/recherche/?", keywords,
                    self.thread_accumulator)
                fs = FigaroStaticScrapper(
                    "http://recherche.lefigaro.fr/recherche/", keywords,
                    self.thread_accumulator)
            else:
                ns = NouvelobsStaticScrapper(
                    keywords, requested_by=self.thread_accumulator)
                ls = LiberationStaticScrapper(
                    keywords, requested_by=self.thread_accumulator)
                fs = FigaroStaticScrapper(
                    keywords, requested_by=self.thread_accumulator)
            ls.start()
            ns.start()
            fs.start()
        if 'en' in langs:
            if conf.SCRAPPERS_VERSION == 1:
                nys = NYTScrapper("https://www.nytimes.com/search/", keywords,
                                  self.thread_accumulator)
                bbs = BBCScrapper("https://www.bbc.co.uk/search?", keywords,
                                  self.thread_accumulator)
                # cnn = CNNScrapper("https://edition.cnn.com/search/?", keywords, self.thread_accumulator)
                dps = DiplomatScrapper(
                    'https://www.googleapis.com/customsearch/v1element?',
                    keywords, self.thread_accumulator)
            else:
                nys = NYTScrapper(keywords, requested_by=self.thread_accumulator)
                bbs = BBCScrapper(keywords, requested_by=self.thread_accumulator)
                # cnn = CNNScrapper(keywords, self.thread_accumulator)
                dps = DiplomatScrapper(keywords, requested_by=self.thread_accumulator)
                # tis = TheInterceptScrapper(keywords, requested_by=self.thread_accumulator)
            nys.start()
            bbs.start()
            # cnn.start()
            dps.start()
            # tis.start()
        for t in self.threads:
            t.join()

    def recursive_search(self, initial_keywords, current_keywords, depth, langs=['en']):
        if depth == 0:
            return None
        analyser = Analyser(conf.NLP_HOST, conf.NLP_PORT)
        print("running recursive search at depth {} for keyword {} from initial keyword {}"
              .format(depth, current_keywords, initial_keywords))
        self.run_scrappers(current_keywords, langs=['en'])
        fwst = self.dbf.find_with_search_term(current_keywords)
        print("found {} document{} originating from keyword {}".format(
            len(fwst), '' if len(fwst) <= 1 else 's', current_keywords))
        fwc = self.dbf.find_with_content(self.keywords, exact=True)
        print("found {} document{} containing text {}".format(
            len(fwc), '' if len(fwc) <= 1 else 's', current_keywords))
        notk = self.dbf.find_tokenifiable(langs=["en"])
        nowc = self.dbf.find_wordcountable(langs=["en"])
        print("found {} tokenifiable doc{}".format(
            notk.count(), '' if notk.count() == 1 else 's'))
        print("found {} wordcountable doc{}".format(
            nowc.count(), '' if nowc.count() == 1 else 's'))
        self.dbf.batch_tokenify(notk, analyser)
        wordcounts = self.dbf.get_wordcounts(current_keywords)
        global_wordcount = wcm.aggregate_wordcount_dicts(wordcounts)
        # print(global_wordcount)
        global_wordcount_dict_best = {
            k: wcm.take_firsts(v, n=3)
            for k, v in global_wordcount.items()
            if k in ["PERSON", "ORGANIZATION"]
        }
        global_wordcount_best = wcm.aggregate_subjects(global_wordcount_dict_best)
        for token in global_wordcount_best:
            self.recursive_search(initial_keywords, token[0][0], depth - 1, langs)


if __name__ == '__main__':
    print("running on Python version {}".format(sys.version))
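# Usage sketch (illustrative): Congruence.run() scrapes, tokenizes, builds the
# global graph and returns the graph document read back from the database via
# DBFace.get_graph().
congruence = Congruence()
graph_doc = congruence.run("kim jong")
print(graph_doc)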