def search():
    try:
        # db, preprocess, get_word2vec, get_tfidt_vector and similarity are
        # expected to be available at module level.
        searches = db.searches
        articles = db.articles
        searchresults = db.searchresults
        searchstatus = db.searchstatus

        text = sys.argv[1]
        print(text)
        search_text = preprocess(text)
        search_text_tokens = search_text.split(' ')

        dic = {"preprocessed_text": search_text}
        search_v_w2v = get_word2vec(dic)
        search_v_tfidf = get_tfidt_vector(dic)

        result_w2v_list = []
        result_tfidf_list = []
        result_exact_list = []

        now = datetime.datetime.now()
        count = 0
        # Only look at articles crawled in the last 24 hours.
        for a in articles.find({"timestamp": {"$gt": now.timestamp() - 86400.0}}):
            count += 1
            if not np.all(search_v_w2v == 0):
                if not np.all(np.array(a["w2v"]) == 0):
                    if similarity(np.array(a["w2v"]), search_v_w2v) > 0.3:
                        result_w2v_list.append(a)
            if not np.all(np.array(a["tfidf"]) == 0):
                if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.0:
                    result_tfidf_list.append(a)
            # for token in search_text_tokens:
            #     if token in a["text"]:
            #         result_exact_list.append(a)
            #         break

        # Persisting the results into searchresults/searchstatus (as the web
        # search script does) is left disabled here; only counts are printed.
        # searchresults.delete_many({})
        # searchresults.insert_one({"search_text": text, "result": result_w2v_list, "type": "w2v"})
        # searchresults.insert_one({"search_text": text, "result": result_tfidf_list, "type": "tfidf"})
        # searchresults.insert_one({"search_text": text, "result": result_exact_list, "type": "exact"})
        # searchstatus.insert_one({"status": "done"})

        print(len(result_tfidf_list))
        print(len(result_w2v_list))
        sys.stdout.flush()
    except Exception:
        print("exception occurred")
        sys.stdout.flush()
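# The range scan above filters on the "timestamp" field with "$gt". A minimal
# sketch of a one-off maintenance step that indexes that field so the scan does
# not have to walk the whole collection; this script is an assumption and is
# not part of the original repo, but create_index is standard pymongo API.
from pymongo import MongoClient, DESCENDING

client = MongoClient()
for db_name, coll_name in [("newsdb", "articles"), ("newsdb_week", "weekarticles")]:
    client[db_name][coll_name].create_index([("timestamp", DESCENDING)])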
def add_vector_for_old_news():
    client = MongoClient()
    db = client['newsdb']
    articles = db.articles

    # Recompute the w2v and tfidf vectors for every stored article, then
    # rewrite the collection with the enriched documents.
    all_news_list = []
    for a in articles.find():
        b = a
        # if "title" in a:
        #     b["preprocessed_title"] = preprocess(a["title"])
        if "summary" in a:
            b["preprocessed_summary"] = preprocess(a["summary"])
        w2v_vector = get_word2vec(b).tolist()
        tfidf_vector = get_tfidt_vector(b).tolist()
        b["w2v"] = w2v_vector
        b["tfidf"] = tfidf_vector
        all_news_list.append(b)
        print("done")

    print("first step done")
    articles.delete_many({})
    print("second step done")
    for dic in all_news_list:
        articles.insert_one(dic)
    print("third step done")
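# preprocess() is imported from elsewhere in the repo and is not shown here.
# A minimal sketch of what such a Persian preprocessing step could look like,
# assuming the hazm toolkit (normalization, tokenization, stop-word removal);
# the project's real implementation may differ.
from hazm import Normalizer, word_tokenize, stopwords_list

_normalizer = Normalizer()
_stopwords = set(stopwords_list())

def preprocess(text):
    # Assumed behaviour: normalize the Persian text, tokenize it, drop stop
    # words, and return a space-joined string (the callers split on ' ').
    if not text:
        return ""
    tokens = word_tokenize(_normalizer.normalize(text))
    return " ".join(t for t in tokens if t not in _stopwords)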
def parse(self, response):
    dic = {"title": " ", "timestamp": "", "url": " ", "date": " ",
           "text": " ", "summary": " ", "tags": [],
           "article_section": " ", "code": " "}

    title = response.xpath('//h1[@class="title"]/text()').get()
    dic["title"] = title

    news_url = response.xpath('//a[@id="short-link"]/text()').get()
    dic["url"] = " ".join(news_url.split())

    sections = []
    dic["article_section"] = sections

    summary = response.xpath('//h3[@class="lead"]/text()').get()
    dic["summary"] = summary

    # Convert the Jalali publication date to a Gregorian datetime.
    date = response.xpath('//li[@class="time"]/text()').get()
    date_list = date.split(' ')
    timelist = date_list[5].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[1])
    month = month_dic[date_list[2]]
    year = convert_persian_to_english_numbers(date_list[3])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = "no code"
    dic["code"] = code
    tags = []
    dic["tags"] = tags

    text_list = response.xpath('//div[@class="story"]/p/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text

    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    # Index into Elasticsearch and store in MongoDB.
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
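# Every spider relies on convert_persian_to_english_numbers and month_dic,
# which are defined elsewhere in the repo. A minimal sketch of what they
# presumably look like (Persian digits mapped to ASCII, Jalali month names
# mapped to month numbers); the exact originals may differ.
PERSIAN_TO_ENGLISH_DIGITS = str.maketrans('۰۱۲۳۴۵۶۷۸۹', '0123456789')

def convert_persian_to_english_numbers(text):
    return text.translate(PERSIAN_TO_ENGLISH_DIGITS)

month_dic = {
    'فروردین': 1, 'اردیبهشت': 2, 'خرداد': 3,
    'تیر': 4, 'مرداد': 5, 'شهریور': 6,
    'مهر': 7, 'آبان': 8, 'آذر': 9,
    'دی': 10, 'بهمن': 11, 'اسفند': 12,
}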
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }

    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])

    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "http://titrnews.ir" + news_url

    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections

    summary = response.xpath('//div[@class="subtitle"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])

    # Parse the Jalali date string and convert it to a Gregorian datetime.
    date = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath('//div[@class="news_nav news_id_c"]/text()').get()
    dic["code"] = code

    tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
    dic["tags"] = tags

    text_list = response.xpath('//div[@class="body"]/p/span/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])

    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {"title": " ", "timestamp": " ", "url": " ", "date": " ",
           "text": " ", "summary": " ", "tags": [],
           "article_section": " ", "code": " "}

    title = response.xpath(
        '//span[@class="title mb-2 d-block text-justify"]/text()').get()
    dic["title"] = title

    news_url = response.css('link[rel=canonical]::attr(href)').extract()[0]
    dic["url"] = news_url

    sections = response.xpath(
        '//div[@class="category-name d-flex justify-content-center"]/span/a/text()').getall()
    final = []
    for s in sections:
        processed_text = " ".join(s.split())
        final.append(processed_text)
    dic["article_section"] = final

    summary = response.xpath('//p[@class="lead p-2 text-justify"]/text()').get()
    dic["summary"] = summary

    # The publication date appears in one of two layouts; handle both.
    date = response.xpath(
        '//div[@class="publish-time d-flex justify-content-center"]/span/text()').getall()
    if len(date) > 1:
        timelist = date[0].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])
        date_list = date[2].split('/')
        day = convert_persian_to_english_numbers(date_list[2])
        month = convert_persian_to_english_numbers(date_list[1])
        yearlist = date_list[0].split('،')
        year = convert_persian_to_english_numbers(yearlist[0])
    else:
        date = response.xpath('//span[@class="publish-time text-center"]/text()').get()
        date_list = date.split(' ')
        timelist = date_list[2].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])
        d_list = date_list[0].split('/')
        day = convert_persian_to_english_numbers(d_list[2])
        month = convert_persian_to_english_numbers(d_list[1])
        year = convert_persian_to_english_numbers(d_list[0])

    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = ''
    dic["code"] = code

    tags = response.xpath(
        '//div[@class="tags mt-4 text-right d-flex flex-wrap"]/a/text()').getall()
    finaltags = []
    for t in tags:
        processed_text = " ".join(t.split())
        finaltags.append(processed_text)
    dic["tags"] = finaltags

    text_list = response.xpath(
        '//div[@class="nt-body text-right mt-4"]/p/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text

    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "timestamp": "",
        "title": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }

    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])

    news_url = response.css('h1[class=title] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "https://www.hamshahrionline.ir" + news_url

    sections = response.xpath('//li[@class="breadcrumb-item"]/a/text()').getall()
    dic["article_section"] = sections[1:]

    summary = response.xpath('//p[@class="introtext"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])

    # Convert the Jalali publication date to a Gregorian datetime.
    date = response.xpath(
        '//div[@class="col-6 col-sm-4 col-xl-4 item-date"]/span/text()').get()
    date_parts = date.split(' ')
    day = convert_persian_to_english_numbers(date_parts[1])
    month = month_dic[date_parts[2]]
    year = convert_persian_to_english_numbers(date_parts[3])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = date_parts[5]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath('//div[@class="item-code"]/span/text()').get()
    dic["code"] = code

    tags = response.xpath('//section[@class="box tags"]/div/ul/li/a/text()').getall()
    dic["tags"] = tags

    text_parts = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])

    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {"title": " "}

    title = response.xpath('//h1[@class="title h_title_news"]/a/text()').get()
    dic["title"] = title

    try:
        news_url = response.css('h1.h_title_news a::attr(href)').extract()[0]
    except Exception:
        news_url = response.css('h1.Htags a::attr(href)').extract()[0]
    dic["url"] = "https://namehnews.com" + news_url

    # news_path
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections

    summary = response.xpath('//div[@class="subtitle sub_news"]/text()').get()
    dic["summary"] = summary

    date = response.xpath(
        '//div[@class="news_nav news_pdate_c col-xs-36 col-sm-14 pull-left"]/text()'
    ).getall()
    date = " ".join(date[1].split())
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath(
        '//div[@class="news_nav news_id_c col-xs-36 col-sm-11"]/text()').getall()
    code = " ".join(code[1].split())
    dic["code"] = code

    tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
    dic["tags"] = tags

    text_list = response.xpath('//div[@class="body body_news"]/div/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text

    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }

    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title

    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "https://www.mehrnews.com" + news_url

    sections = response.xpath(
        '//div[@class="col-6 col-sm-4"]/ol[@class="breadcrumb"]/li/a/text()').getall()
    dic["article_section"] = sections

    summary = response.xpath('//p[@class="summary introtext"]/text()').get()
    dic["summary"] = summary

    # Convert the Jalali publication date to a Gregorian datetime.
    date = response.xpath('//div[@class="col-6 col-sm-4 item-date"]/span/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    yearlist = date_list[2].split('،')
    year = convert_persian_to_english_numbers(yearlist[0])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath('//div[@class="item-code"]/span/text()').get()
    dic["code"] = code

    tags = response.xpath('//section[@class="box tags"]/div/ul/li/a/text()').getall()
    dic["tags"] = tags

    text_list = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text

    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def search():
    client = MongoClient()
    src_db = client['newsdb']
    articles = src_db.articles
    dst_db = client['webdb']
    searchresults = dst_db.searchresults

    # text = searches.find().sort("_id", -1)[0]["text"]
    text = sys.argv[1]
    search_text = preprocess(text)
    search_text_tokens = search_text.split(' ')

    dic = {"preprocessed_text": search_text}
    search_v_w2v = get_word2vec(dic)
    search_v_tfidf = get_tfidt_vector(dic)

    now = datetime.datetime.now()
    result_w2v_list = []
    result_tfidf_list = []
    result_exact_list = []

    count = 0
    count_tokens = 0
    # Only consider articles crawled in the last 24 hours.
    for a in articles.find({"timestamp": {"$gt": now.timestamp() - 86400.0}}):
        count += 1
        if not np.all(search_v_w2v == 0):
            if not np.all(np.array(a["w2v"]) == 0):
                if similarity(np.array(a["w2v"]), search_v_w2v) > 0.8:
                    result_w2v_list.append(a)
        if not np.all(np.array(a["tfidf"]) == 0):
            if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.7:
                result_tfidf_list.append(a)
        # for token in search_text_tokens:
        #     if token in a["text"]:
        #         count_tokens += 1
        # if count_tokens == len(search_text_tokens):
        #     result_exact_list.append(a)

    # Replace the previous result set with the new one.
    searchresults.delete_many({})

    mydict = {"search_text": text, "result": result_w2v_list, "type": "w2v"}
    searchresults.insert_one(mydict)

    mydict = {"search_text": text, "result": result_tfidf_list, "type": "tfidf"}
    searchresults.insert_one(mydict)

    # mydict = {"search_text": text, "result": result_exact_list, "type": "exact"}
    # searchresults.insert_one(mydict)

    print("ok")
    sys.stdout.flush()
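# The web side presumably reads the freshly written documents back out of
# webdb.searchresults. A minimal sketch of that read path; the reader script
# itself is an assumption, only the document shape comes from the code above.
from pymongo import MongoClient

client = MongoClient()
searchresults = client['webdb'].searchresults

for result_type in ("w2v", "tfidf"):
    doc = searchresults.find_one({"type": result_type})
    if doc:
        print(result_type, doc["search_text"], len(doc["result"]))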
def parse(self, response):
    dic = {"title": " "}

    title = response.xpath('//h1[@class="title title_news"]/span/text()').get()
    dic["title"] = title

    news_url = response.css('h1.title_news span::attr(href)').extract()[0]
    dic["url"] = "https://www.iribnews.ir" + news_url

    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections

    summary = response.xpath('//p[@class="subtitle"]/text()').get()
    dic["summary"] = summary

    # The scraped date string yields many tokens when split on spaces,
    # hence the large indices below.
    date = response.xpath(
        '//div[@class="news_nav news_pdate_c col-sm-16 col-xs-25"]/text()').getall()
    date = date[1]
    date_list = date.split(' ')
    timelist = date_list[21].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[17])
    month = month_dic[date_list[18]]
    year = convert_persian_to_english_numbers(date_list[19])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath(
        '//div[@class="news_nav news_id_c col-sm-10 col-xs-11"]/text()').getall()
    code = " ".join(code[1].split())
    dic["code"] = code

    tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
    dic["tags"] = tags

    text_list = response.xpath(
        '//div[@class="body body_media_content_show"]/div/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text

    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "timestamp": "",
        "title": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }

    title = response.xpath('//h1[@class="Htags"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])

    news_url = response.css('h1[class=Htags] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "https://www.yjc.ir" + news_url

    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections

    summary = response.xpath('//strong[@class="news_strong"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])

    # Join the date fragments, then convert the Jalali date to Gregorian.
    date_list = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').getall()
    date = ""
    for d in date_list:
        date += d
    date_parts = date.split(' ')
    day = convert_persian_to_english_numbers(date_parts[0])
    month = month_dic[date_parts[1]]
    year = convert_persian_to_english_numbers(date_parts[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = date_parts[4]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code_list = response.xpath('//div[@class="news_nav news_id_c"]/text()').getall()
    code = ""
    for c in code_list:
        code += c
    dic["code"] = code

    tags = response.xpath('//div[@class="tag_items"]/a/text()').getall()
    dic["tags"] = tags

    text_parts = response.xpath('//div[@class="body"]/p/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])

    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }

    title = response.xpath('//h1[@id="docDiv3TitrMain"]/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])

    news_url = response.css('a[class=print-icon]::attr(href)').extract()[0]
    dic["url"] = "http://www.akhbarbank.com" + news_url.replace("/print", "")

    sections = response.xpath('//div[@class="dsinfo-p1-active"]/a/text()').getall()
    dic["article_section"] = sections

    summary = response.xpath('//div[@id="docDivLead3"]/div/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])

    # Convert the Jalali publication date to a Gregorian datetime.
    date = response.xpath('//div[@id="docDiv3Date"]/text()').get()
    date_list = date.split(' ')
    timelist = date_list[5].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[1])
    month = month_dic[date_list[2]]
    year = convert_persian_to_english_numbers(date_list[3])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath('//div[@id="docDocID"]/text()').get()
    dic["code"] = code

    tags = []
    dic["tags"] = tags

    text_list = response.xpath('//div[@id="doctextarea"]/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])

    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }

    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])

    news_url = response.css('h1[class=title] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "http://sobhanehonline.com" + news_url

    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    if len(sections) > 0:
        dic["article_section"] = sections[0]

    summary = response.xpath('//div[@class="subtitle"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])

    # Convert the Jalali publication date to a Gregorian datetime.
    date = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').get()
    date_parts = date.split(' ')
    day = convert_persian_to_english_numbers(date_parts[0])
    month = month_dic[date_parts[1]]
    year = convert_persian_to_english_numbers(date_parts[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = date_parts[4]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath('//div[@class="news_nav news_id_c"]/text()').get()
    dic["code"] = code

    tags = response.xpath('//a[@class="tags_item"]/text()').getall()
    dic["tags"] = tags

    text_parts = response.xpath('//div[@align="justify"]/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    if len(text) < 1:
        # Fall back to the body div / paragraphs when the justified div is empty.
        maybe_text = response.xpath('//div[@class="body"]/text()').getall()
        for t in maybe_text:
            text += t
        maybe_p = response.xpath('//div[@class="body"]/p/text()').getall()
        for p in maybe_p:
            text += p
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])

    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def lstm(params):
    data, count, dictionary, embeddings, normalized_embeddings, weights, biases = word2vec.get_word2vec(
        2, False)
    words_size = embeddings.shape[0]
    embedding_size = embeddings.shape[1]
    print('Most common words (+UNK)', count[:5])
    print('embedding size:%s data:%s' %
          (embedding_size, [dictionary[word] for word in data[:100]]))

    # Create a small validation set.
    valid_size = 1000
    valid_text = data[:valid_size]
    train_text = data[valid_size:]
    train_size = len(train_text)

    p_num_unrollings = params['num_unrollings']
    p_batch_size = params['batch_size']

    class BatchGenerator(object):
        def __init__(self, text, batch_size, num_unrollings):
            assert batch_size >= 1
            assert num_unrollings >= 1
            self._text = text
            self._text_size = len(text)
            self._batch_size = batch_size
            self._num_unrollings = num_unrollings
            segment = self._text_size // batch_size
            self._cursor_boundary = [
                offset * segment for offset in range(batch_size)
            ]
            self._cursor = self._cursor_boundary[:]
            self._last_batch = self._next_batch()

        def _next_batch(self):
            """Generate a single batch from the current cursor position in the data."""
            batch = np.zeros(shape=(self._batch_size, embedding_size), dtype=float)
            for b in range(self._batch_size):
                batch[b] = normalized_embeddings[self._text[self._cursor[b]]]
                self._cursor[b] = (self._cursor[b] + 1)
            if self._cursor[self._batch_size - 1] == self._text_size:
                self._cursor = self._cursor_boundary[:]
            return batch

        def next(self):
            """Generate the next array of batches from the data. The array consists of
            the last batch of the previous array, followed by num_unrollings new ones.
            """
            batches = [self._last_batch]
            for _ in range(self._num_unrollings):
                batches.append(self._next_batch())
            self._last_batch = batches[-1]
            return batches

    def batches2string(batches):
        """Convert a sequence of batches back into their (most likely) string
        representation."""
        s = [''] * batches[0].shape[0]
        for b in batches:
            words = [
                dictionary[w]
                for w in np.argmax(np.matmul(b, normalized_embeddings.T), 1)
            ]
            s = [' '.join(x) for x in zip(s, words)]
        return s

    train_batches = BatchGenerator(train_text, p_batch_size, p_num_unrollings)
    valid_batches = BatchGenerator(valid_text, 1, 1)

    print(batches2string(train_batches.next()))
    print(batches2string(train_batches.next()))
    print(batches2string(train_batches.next()))
    print(batches2string(valid_batches.next()))
    print(batches2string(valid_batches.next()))
    print(batches2string(valid_batches.next()))

    def logprob(predictions, labels):
        """Log-probability of the true labels in a predicted batch."""
        predictions[predictions < 1e-10] = 1e-10
        return np.sum(-np.log(
            [predictions[i, label] for i, label in enumerate(labels)]
        )) / labels.shape[0]

    graph = tf.Graph()
    with graph.as_default():
        p_num_nodes = params['num_nodes']
        p_max_k = params['max_k']

        def create_trainable_variables():
            '''
            Parameters:
                num_nodes*0:num_nodes*1 : Input gate
                num_nodes*1:num_nodes*2 : Forget gate
                num_nodes*2:num_nodes*3 : Output gate
                num_nodes*3:num_nodes*4 : New memory cell
            '''
            W = {
                'L1_W': tf.Variable(
                    tf.truncated_normal([embedding_size, p_num_nodes * 4],
                                        mean=0, stddev=0.1, name="L1_W")),
                'L1_U': tf.Variable(
                    tf.truncated_normal([p_num_nodes, p_num_nodes * 4],
                                        mean=0, stddev=0.1, name="L1_U")),
                'L1_b': tf.Variable(tf.zeros([1, p_num_nodes * 4]), name="L1_b"),
                'L2_W': tf.Variable(
                    tf.truncated_normal([p_num_nodes, p_num_nodes * 4],
                                        mean=0, stddev=0.1, name="L2_W")),
                'L2_U': tf.Variable(
                    tf.truncated_normal([p_num_nodes, p_num_nodes * 4],
                                        mean=0, stddev=0.1, name="L2_U")),
                'L2_b': tf.Variable(tf.zeros([1, p_num_nodes * 4]), name="L2_b"),
                'L3_W': tf.Variable(
                    tf.truncated_normal([p_num_nodes, p_num_nodes * 4],
                                        mean=0, stddev=0.1, name="L3_W")),
                'L3_U': tf.Variable(
                    tf.truncated_normal([p_num_nodes, p_num_nodes * 4],
                                        mean=0, stddev=0.1, name="L3_U")),
                'L3_b': tf.Variable(tf.zeros([1, p_num_nodes * 4]), name="L3_b"),
                'L4_W': tf.Variable(
                    tf.truncated_normal([p_num_nodes, embedding_size],
                                        mean=0, stddev=0.1, name="L4_W")),
                'L4_b': tf.Variable(tf.zeros([embedding_size]), name="L4_b"),
            }
            return W

        def create_variables(batch_size, num_unrollings):
            # Input data.
            train_data = list()
            for _ in range(num_unrollings + 1):
                train_data.append(
                    tf.placeholder(tf.float32, shape=[batch_size, embedding_size]))
            inputs = {
                'inputs': train_data[:num_unrollings],
                'labels': train_data[1:],  # labels are inputs shifted by one time step.
                'data': train_data,
                'dropout': tf.placeholder(tf.float32, name="dropout"),
            }
            # Variables saving state across unrollings.
            last_state = {
                'h1': tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                                  trainable=False, name="h1"),
                'c1': tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                                  trainable=False, name="c1"),
                'h2': tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                                  trainable=False, name="h2"),
                'c2': tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                                  trainable=False, name="c2"),
                'h3': tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                                  trainable=False, name="h3"),
                'c3': tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                                  trainable=False, name="c3"),
            }
            return inputs, last_state

        # Definition of the cell computation.
        def lstm_cell(x, h, c, W, U, b):
            """Create an LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
            Note that in this formulation, we omit the various connections between
            the previous c (i.e. state) and the gates."""
            raw_data = tf.matmul(x, W) + tf.matmul(h, U) + b
            gates = tf.sigmoid(raw_data[:, :p_num_nodes * 3])
            input_gate = gates[:, :p_num_nodes]                      # batch_size x num_nodes
            forget_gate = gates[:, p_num_nodes:p_num_nodes * 2]      # batch_size x num_nodes
            output_gate = gates[:, p_num_nodes * 2:p_num_nodes * 3]  # batch_size x num_nodes
            new_memory_cell = raw_data[:, p_num_nodes * 3:]          # batch_size x num_nodes
            c_next = forget_gate * c + input_gate * tf.tanh(new_memory_cell)
            h_next = output_gate * tf.tanh(c_next)
            return h_next, c_next

        def create_model(W, inputs, last_state):
            ys = list()
            h1 = last_state['h1']
            c1 = last_state['c1']
            h2 = last_state['h2']
            c2 = last_state['c2']
            h3 = last_state['h3']
            c3 = last_state['c3']
            # Construct the 3-layer LSTM, with dropout between the layers.
            for x in inputs['inputs']:
                h1, c1 = lstm_cell(x, h1, c1, W['L1_W'], W['L1_U'], W['L1_b'])
                x2 = tf.nn.dropout(h1, inputs['dropout'], name="dropout")
                h2, c2 = lstm_cell(x2, h2, c2, W['L2_W'], W['L2_U'], W['L2_b'])
                x3 = tf.nn.dropout(h2, inputs['dropout'], name="dropout")
                h3, c3 = lstm_cell(x3, h3, c3, W['L3_W'], W['L3_U'], W['L3_b'])
                ys.append(h3)

            # State saving across unrollings.
            with tf.control_dependencies([
                    last_state['h1'].assign(h1),
                    last_state['c1'].assign(c1),
                    last_state['h2'].assign(h2),
                    last_state['c2'].assign(c2),
                    last_state['h3'].assign(h3),
                    last_state['c3'].assign(c3)
            ]):
                # Classifier.
                Y_pred = tf.nn.xw_plus_b(tf.concat(0, ys), W['L4_W'], W['L4_b'])
                norm = tf.sqrt(tf.reduce_sum(tf.square(Y_pred), 1, keep_dims=True))
                normalized_Y_pred = Y_pred / norm
                Y = tf.concat(0, inputs['labels'])
                l2_loss = params['beta_regularization_value'] * (
                    tf.nn.l2_loss(W['L1_W']) + tf.nn.l2_loss(W['L2_W']) +
                    tf.nn.l2_loss(W['L3_W']) + tf.nn.l2_loss(W['L4_W']))
                loss = tf.contrib.losses.cosine_distance(
                    normalized_Y_pred, Y, dim=1) + l2_loss

            model = {
                'loss': loss,
                'Y_pred': Y_pred,
            }
            return model

        # Convert vec to word.
        norm_embeddings = tf.constant(normalized_embeddings.T)

        W = create_trainable_variables()
        inputs, last_state = create_variables(p_batch_size, p_num_unrollings)

        # Unrolled LSTM loop.
        model = create_model(W, inputs, last_state)

        # Optimizer.
        global_step = tf.Variable(0)
        learning_rate = tf.train.exponential_decay(
            params['start_learning_rate'], global_step, 5000, 0.1, staircase=True)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        gradients, v = zip(*optimizer.compute_gradients(model['loss']))
        gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
        optimizer = optimizer.apply_gradients(zip(gradients, v),
                                              global_step=global_step)

        grad_sum = [
            tf.sqrt(tf.reduce_mean(tf.square(gradient)))
            for gradient in gradients[:len(gradients) - 2]
        ]
        v_sum = [
            tf.sqrt(tf.reduce_mean(tf.square(variable)))
            for variable in v[:len(gradients) - 2]
        ]
        grad_v_sum = [grad / v for grad, v in zip(grad_sum, v_sum)]
        grad_sum_string = tf.Print(grad_sum, [grad_sum], message="grad_sum: ")
        v_sum_string = tf.Print(v_sum, [v_sum], message="v_sum: ")
        grad_v_sum_string = tf.Print(grad_v_sum, [grad_v_sum], message="grad_v_sum: ")

        # Sampling and validation eval: batch 1, no unrolling.
        sample_batch_size = 1
        sample_num_unrollings = 1
        sample_inputs, sample_last_state = create_variables(
            sample_batch_size, sample_num_unrollings)
        sample_model = create_model(W, sample_inputs, sample_last_state)
        reset_sample_state = tf.group(
            sample_last_state['h1'].assign(tf.zeros([sample_batch_size, p_num_nodes])),
            sample_last_state['c1'].assign(tf.zeros([sample_batch_size, p_num_nodes])),
            sample_last_state['h2'].assign(tf.zeros([sample_batch_size, p_num_nodes])),
            sample_last_state['c2'].assign(tf.zeros([sample_batch_size, p_num_nodes])),
            sample_last_state['h3'].assign(tf.zeros([sample_batch_size, p_num_nodes])),
            sample_last_state['c3'].assign(tf.zeros([sample_batch_size, p_num_nodes])))
        similarity = tf.matmul(sample_model['Y_pred'], norm_embeddings)
        sample_next = tf.nn.top_k(similarity, p_max_k)[1]

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()

    p_epochs = params['epochs']
    p_summary_frequency = params['summary_frequency']
    with tf.Session(graph=graph) as session:
        tf.initialize_all_variables().run()
        print('Initialized')
        if os.path.exists(params['savefile']) and params['resume']:
            # Restore variables from disk.
            saver.restore(session, params['savefile'])
            print("Model restored.")

        start_time = time.time()
        n_batch = len(data) // p_batch_size
        for epoch in range(int(math.ceil(p_epochs))):
            # p_epochs can be 0.001 to test overfit.
            fraction = p_epochs - epoch
            if fraction < 1:
                n_batch = n_batch * fraction
            total_step = int(math.ceil(n_batch))
            mean_loss = 0
            print("Epoch %s start / total p_epochs %s, total steps %s" %
                  (epoch, p_epochs, total_step))
            for step in range(total_step):
                batches = train_batches.next()
                inputs_dict = dict()
                for i in range(p_num_unrollings + 1):
                    inputs_dict[inputs['data'][i]] = batches[i]
                inputs_dict[inputs['dropout']] = params['dropout']
                _, loss_e, learning_rate_e = session.run(
                    [optimizer, model['loss'], learning_rate],
                    feed_dict=inputs_dict)
                mean_loss += loss_e
                if step % p_summary_frequency == 0:
                    mean_loss = mean_loss / p_summary_frequency
                    # The mean loss is an estimate of the loss over the last few batches.
                    # PP = exp(CE) = exp(-log(prediction)) = 1/prediction.
                    # max PP = 1 / (1/50000) = 50000
                    print('Average loss at step(%d):%f learning rate:%.2f time:%s' %
                          (step, mean_loss, learning_rate_e,
                           timedelta(seconds=(time.time() - start_time))))
                    mean_loss = 0

                    def sample(candidate_indices):
                        # check https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py#L62
                        k = int(abs(random.normalvariate(0, p_max_k / 2))) % p_max_k
                        index = candidate_indices[k]
                        # Skip UNK.
                        if len(candidate_indices) > 1:
                            while index == 0:
                                k = int(abs(random.normalvariate(0, p_max_k / 2))) % p_max_k
                                index = candidate_indices[k]
                        return index

                    if step % (p_summary_frequency * 10) == 0:
                        # Generate some samples.
                        print('=' * 80)
                        for _ in range(5):
                            word = int(random.uniform(0, 1) * words_size) % words_size
                            feed = np.array([embeddings[word]])
                            sentence = dictionary[word]
                            reset_sample_state.run()
                            for _ in range(79):
                                prediction = sample_next.eval({
                                    sample_inputs['inputs'][0]: feed,
                                    sample_inputs['dropout']: 1,
                                })
                                index = sample(prediction[0, :])
                                feed = np.array([embeddings[index]])
                                sentence += ' ' + dictionary[index]
                            print(sentence)
                        print('=' * 80)

                        # Save the variables to disk.
                        save_path = saver.save(session, params['savefile'])

                        # Measure validation set perplexity.
                        valid_mean_loss = 0
                        reset_sample_state.run()
                        for _ in range(valid_size):
                            validation_batches = valid_batches.next()
                            sample_feeds = {
                                sample_inputs['inputs'][0]: validation_batches[0],
                                sample_inputs['labels'][0]: validation_batches[1],
                                sample_inputs['dropout']: 1,
                            }
                            valid_loss = session.run([sample_model['loss']],
                                                     feed_dict=sample_feeds)
                            valid_mean_loss += valid_loss[0]
                        print('Validation set loss: %.2f. saved:%s' %
                              (valid_mean_loss / valid_size, save_path))
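# A hedged example of the params dictionary that lstm() reads. The key names
# are taken from the function body above; the values are illustrative guesses,
# not the project's actual configuration.
params = {
    # Model size / unrolling.
    'num_nodes': 256,
    'num_unrollings': 10,
    'batch_size': 64,
    'max_k': 10,                        # top-k candidates used when sampling
    # Optimization.
    'start_learning_rate': 1.0,
    'beta_regularization_value': 1e-4,
    'dropout': 0.8,                     # keep probability fed to tf.nn.dropout
    'epochs': 5,
    'summary_frequency': 100,
    # Checkpointing.
    'savefile': 'lstm_model.ckpt',
    'resume': False,
}

lstm(params)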
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }

    title = response.xpath('//h1[@class="title"]/text()').get()
    dic["title"] = title

    news_url = response.css("link[rel='shortlink']::attr(href)").extract()[0]
    dic["url"] = "http://www.rajanews.com" + news_url

    sections = []
    dic["article_section"] = sections

    summary = response.xpath('//div[@class="lead"]/text()').get()
    dic["summary"] = summary

    # This site already exposes a Gregorian date, so no Jalali conversion is needed.
    date = response.xpath('//div[@class="created"]/span/text()').get()
    date_list = date.split(' ')
    timelist = date_list[1].split(':')
    hour = timelist[0]
    minute = timelist[1]
    second = timelist[2]
    date_list = date_list[0].split('-')
    day = date_list[2]
    month = date_list[1]
    year = date_list[0]
    datetime_object = datetime.datetime(int(year), int(month), int(day),
                                        int(hour), int(minute), int(second))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath('//div[@class="news-id"]/text()').get()
    code = " ".join(code.split())
    dic["code"] = code

    tags = []
    dic["tags"] = tags

    text_list = response.xpath('//div[@class="body"]/div/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text

    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }

    title = response.xpath('//h1[@class="first-title"]/text()').get()
    dic["title"] = title

    news_url = response.css('div[class=form-group] input::attr(value)').extract()[0]
    dic["url"] = news_url

    meta_news = response.xpath('//div[@class="meta-news"]/ul/li/span/text()').getall()
    try:
        dic["article_section"] = meta_news[3]
    except Exception:
        dic["article_section"] = []

    summary = response.xpath('//p[@class="summary"]/text()').get()
    dic["summary"] = summary

    # The date comes either from the meta block or from a <time> element.
    try:
        date = meta_news[1]
    except Exception:
        date = response.xpath('//time/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    try:
        dic["code"] = meta_news[5]
    except Exception:
        dic["code"] = ''

    tags = response.xpath('//footer[@class="tags"]/ul/li/a/text()').getall()
    dic["tags"] = tags

    text_list = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text

    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    item_body_SELECTOR = '.item-body'
    text = " "
    # Note: this spider never fills in the "title" placeholder.
    dic = {
        "timestamp": "",
        "url": " ",
        "title": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }

    news_url = response.xpath('//meta[@name="twitter:url"]').xpath('@content').get()
    dic["url"] = news_url

    article_section = response.xpath(
        '//meta[@property="article:section"]').xpath('@content').getall()
    dic["article_section"] = article_section

    item_summary_SELECTOR = '.item-summary p ::text'
    if response.css(item_summary_SELECTOR).extract():
        dic["summary"] = response.css(item_summary_SELECTOR).extract()[0]
        dic["preprocessed_summary"] = preprocess(dic["summary"])

    # The date lives either in the barcode block or in the item-date block.
    date = response.xpath('//div[@class="barcode"]/ul/li[@class="date"]/text()').get()
    if date is None:
        date = response.xpath('//div[@class="item-date"]/span/text()').get()
    date_parts = date.split(' ')
    day = convert_persian_to_english_numbers(date_parts[0])
    month = month_dic[date_parts[1]]
    year = convert_persian_to_english_numbers(date_parts[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = date_parts[4]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath('//div[@class="barcode"]/ul/li[@class="id"]/span/text()').get()
    if code is None:
        code = response.xpath('//input[@id="newsId"]').xpath('@value').get()
    dic["code"] = code

    tags = response.xpath('//section[@class="box tags"]/div/ul/li/a/text()').getall()

    for brickset in response.css(item_body_SELECTOR):
        item_text_SELECTOR = '.item-text p ::text'
        paragraphs = brickset.css(item_text_SELECTOR).extract()
        # The final paragraph is not included.
        for i in range(0, len(paragraphs) - 1):
            text = text + '\n' + paragraphs[i]

    dic["text"] = text
    dic["tags"] = tags
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }

    title = response.xpath('//h1[@class="Htag"]/a/text()').get()
    dic["title"] = title

    news_url = response.css('h1[class=Htag] a::attr(href)').extract()[0]
    dic["url"] = "https://www.tabnak.ir" + news_url

    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    final_sections = []
    for sec in sections:
        processed_text = " ".join(sec.split())
        final_sections.append(processed_text)
    dic["article_section"] = final_sections

    summary = response.xpath('//div[@class="subtitle"]/text()').getall()
    dic["summary"] = summary[1]

    # "sapn" is left as in the original selector.
    date = response.xpath('//sapn[@class="fa_date"]/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath('//div[@class="news_id_c"]/text()').get()
    dic["code"] = " ".join(code.split())

    tags = response.xpath('//div[@class="tag_items"]/a/text()').getall()
    dic["tags"] = tags

    text_list = response.xpath('//div[@class="body"]/div/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text

    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "timestamp": " ",
        "title": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "preprocessed_text": " ",
        "w2v": [],
        "tfidf": [],
        "code": " "
    }

    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])

    news_url = response.css('h1[class=title] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "https://www.asriran.com" + news_url

    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections[1:]

    summary = response.xpath('//div[@class="subtitle"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])

    # The publication date appears in one of two layouts; normalise either
    # into a date part and a time part before parsing.
    date_list = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').getall()
    if len(date_list) > 0:
        date = ""
        for d in date_list:
            date += d
        newdate = ''.join(date.split())
        parts = newdate.split('-')
        justdate = parts[1]
        justtime = parts[0]
    else:
        date = response.xpath('//div[@class="update_date"]/text()').getall()[0]
        newdatetmp = ''.join(date.split())
        tmp = newdatetmp.split(":")
        newdate = ':'.join(tmp[1:])
        parts = newdate.split('-')
        justdate = parts[0]
        justtime = parts[1]

    timelist = justtime.split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])

    # Split the squeezed Jalali date into day / month name / year by scanning
    # for the boundary between digit and non-digit characters.
    index = 0
    for char in justdate:
        if char not in num_dic:
            index = justdate.index(char)
            break
    day = convert_persian_to_english_numbers(justdate[0:index])
    monthandyear = justdate[index:]
    for char in monthandyear:
        if char in num_dic:
            index = monthandyear.index(char)
            break
    month = month_dic[monthandyear[0:index]]
    year = convert_persian_to_english_numbers(monthandyear[index:])

    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code_list = response.xpath('//div[@class="news_nav news_id_c"]/text()').getall()
    code = ""
    for c in code_list:
        code += c
    dic["code"] = code

    tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
    dic["tags"] = tags

    text_parts = response.xpath('//div[@class="body"]/p/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    if len(text) < 1:
        # Fall back to alternative body layouts.
        maybe_div = response.xpath('//div[@class="body"]/div/text()').getall()
        for d in maybe_div:
            text += d
        maybe_p = response.xpath('//div[@class="body"]/p/text()').getall()
        for p in maybe_p:
            text += p
        maybe_s = response.xpath('//div[@class="body"]/p/span/text()').getall()
        for s in maybe_s:
            text += s
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])

    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    articles.insert_one(dic)
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//a[@itemprop="headline"]/text()').get()
    dic["title"] = title
    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "https://www.mashreghnews.ir" + news_url
    sections = response.xpath('//ol[@class="breadcrumb"]/li/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//p[@class="summary introtext"]/text()').get()
    dic["summary"] = summary
    date = response.xpath(
        '//div[@class="col-xs-8 col-sm-6 item-date"]/span/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="col-xs-8 col-sm-3"]/span/text()').getall()
    dic["code"] = code
    tags = response.xpath(
        '//section[@class="box tags clean list-clean list-inline header-inline header-clean negative-margin bg-graylight"]/div/ul/li/a/text()'
    ).getall()
    dic["tags"] = tags
    text_list1 = response.xpath('//div[@class="item-text"]/p/span/text()').getall()
    text_list2 = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for t in text_list1:
        text += t
    for t in text_list2:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def parse(self, response):
    dic = {"title": " "}

    title = response.xpath('//div[@class="news-head"]/h6/text()').get()
    title += response.xpath('//div[@class="news-head"]/h2/text()').get()
    dic["title"] = title

    news_url = response.xpath(
        '//*[@id="st-container"]/div/div/div/main/div[1]/div/div/div/ul/li[3]/a/@href'
    ).extract()[0]
    dic["url"] = "http://behdasht.gov.ir" + news_url

    sections = response.xpath(
        '//*[@id="page-content"]/div/article/div/div[2]/div/div/ul/li[1]/text()'
    ).getall()
    dic["article_section"] = sections[2]

    summary = response.xpath('//div[@class="news-lead"]/p/text()').get()
    dic["summary"] = summary

    # The site reports a two-digit Jalali year, hence the 1300 offset below.
    date = response.xpath(
        '//*[@id="page-content"]/div/div[1]/div/div[1]/div/ul/li[1]/span/text()'
    ).get()
    date_list = date.split(' ')
    timelist = date_list[5].split(':')
    hour = timelist[0]
    minute = timelist[1]
    date_list = date_list[0].split("/")
    day = date_list[2]
    month = date_list[1]
    year = date_list[0]
    jalili_date = jdatetime.date(1300 + int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()

    code = response.xpath(
        '//*[@id="page-content"]/div/article/div/div[2]/div/div/ul/li[2]/text()'
    ).getall()
    dic["code"] = code[2]

    tags = response.xpath('//div[@class="es-news-tags"]/ul/li/a/text()').getall()
    dic["tags"] = tags

    text_list = response.xpath('//div[@class="news-content"]/div/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text

    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
import numpy as np
from pymongo import MongoClient

from word2vec import get_word2vec
from tfidf import get_tfidt_vector
# preprocess comes from the project's text preprocessing module
# (its import statement is not part of this snippet).

client = MongoClient()
db = client['newsdb']
search_text = db.searches
articles = db.articles
search_result = db.searchresults

text = search_text.find().sort("_id", -1)[1000]["text"]
search_text = preprocess(text)

dic = {"preprocessed_text": search_text}
search_v_w2v = get_word2vec(dic)
search_v_tfidf = get_tfidt_vector(dic)


def similarity(vec, other_vec):
    """Cosine similarity between two vectors."""
    dot = np.dot(vec, other_vec)
    norma = np.linalg.norm(vec)
    normb = np.linalg.norm(other_vec)
    cos = dot / (norma * normb)
    return cos


wanted_news = []
# for res in result:
#     dic = {"text": "", "similar_texts": []}
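# A small usage sketch of similarity(). Note that the callers guard against
# all-zero vectors before calling it, since the cosine is undefined
# (division by zero) when either norm is zero.
import numpy as np

a = np.array([1.0, 0.0, 1.0])
b = np.array([0.5, 0.5, 0.0])

if not np.all(a == 0) and not np.all(b == 0):
    print(similarity(a, b))  # 0.5 for these two vectors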
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "http://www.shafaf.ir" + news_url
    sections = []
    dic["article_section"] = sections
    summary = response.xpath('//p[@itemprop="description"]/text()').get()
    dic["summary"] = summary
    date = response.xpath(
        '//div[@class="news_nav news_pdate_c col-sm-16 col-xs-36"]/text()').getall()
    date = date[1]
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news_nav news_id_c"]/text()').get()
    # code = processed_text = " ".join(code.split())
    # code_list = code.split(' ')
    dic["code"] = code
    tags = []
    dic["tags"] = tags
    text_list1 = response.xpath('//div[@class="body"]/p/text()').getall()
    if len(text_list1) == 0:
        # item-text
        text_list2 = response.xpath(
            '//div[@class="body"]/div[@class="item-text"]/p/text()').getall()
        text_list = text_list2
    else:
        text_list = text_list1
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)
def search(text):
    try:
        client = MongoClient()
        src_db = client['newsdb_week']
        dst_db = client['webdb']
        articles = src_db.weekarticles
        searchresults = dst_db.searchresults

        search_text = preprocess(text)
        search_text_tokens = search_text.split(' ')

        dic = {"preprocessed_text": search_text}
        search_v_w2v = get_word2vec(dic)
        search_v_tfidf = get_tfidt_vector(dic)

        result_w2v_list = []
        result_tfidf_list = []
        result_exact_list = []

        now = datetime.datetime.now()
        count = 0
        # Only consider articles crawled in the last 60 days.
        for a in articles.find({"timestamp": {"$gt": now.timestamp() - 5184000.0}}):
            count = count + 1
            if not np.all(search_v_w2v == 0):
                if not np.all(np.array(a["w2v"]) == 0):
                    if similarity(np.array(a["w2v"]), search_v_w2v) > 0.8:
                        result_w2v_list.append(a)
            if not np.all(np.array(a["tfidf"]) == 0):
                if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.3:
                    result_tfidf_list.append(a)
            # for token in search_text_tokens:
            #     if token in a["text"]:
            #         result_exact_list.append(a)
            #         break

        print(count)
        # Replace the previous result set with the new one.
        searchresults.delete_many({})

        mydict = {"search_text": text, "result": result_w2v_list, "type": "w2v"}
        searchresults.insert_one(mydict)

        mydict = {"search_text": text, "result": result_tfidf_list, "type": "tfidf"}
        searchresults.insert_one(mydict)

        # mydict = {"search_text": text, "result": result_exact_list, "type": "exact"}
        # searchresults.insert_one(mydict)
        # searchstatus.insert_one({"status": "done"})

        print("OK")
        print(len(result_tfidf_list))
        print(len(result_w2v_list))
        s = str(len(result_tfidf_list)) + str(len(result_w2v_list))
        return s
    except Exception as e:
        print(e)
        print("exception occurred")
        return str(e)
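# A minimal sketch of how this weekly search could be driven from the command
# line, mirroring how the other search scripts read sys.argv[1]. The __main__
# guard here is an assumption and is not shown in the original.
import sys

if __name__ == "__main__":
    query = sys.argv[1] if len(sys.argv) > 1 else ""
    print(search(query))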