# These snippets assume module-level imports of sys, datetime, numpy as np,
# jdatetime, pymongo.MongoClient, and elasticsearch.Elasticsearch, plus the
# project helpers preprocess, get_word2vec, get_tfidt_vector, similarity,
# month_dic, num_dic, and convert_persian_to_english_numbers.
def search():
    try:
        searches = db.searches
        articles = db.articles
        searchresults = db.searchresults
        searchstatus = db.searchstatus
        text = sys.argv[1]
        print(text)
        search_text = preprocess(text)
        search_text_tokens = search_text.split(' ')
        dic = {"preprocessed_text": search_text}
        search_v_w2v = get_word2vec(dic)
        search_v_tfidf = get_tfidt_vector(dic)
        result_w2v_list = []
        result_tfidf_list = []
        result_exact_list = []
        now = datetime.datetime.now()
        count = 0
        # Only scan articles crawled in the last 24 hours (86400 s);
        # switch to articles.find() to scan the whole collection.
        for a in articles.find({"timestamp": {"$gt": now.timestamp() - 86400.0}}):
            count += 1
            if not np.all(search_v_w2v == 0):
                if not np.all(np.array(a["w2v"]) == 0):
                    if similarity(np.array(a["w2v"]), search_v_w2v) > 0.3:
                        result_w2v_list.append(a)
            if not np.all(np.array(a["tfidf"]) == 0):
                if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.0:
                    result_tfidf_list.append(a)
        # Exact token matching and persisting results/status to MongoDB are
        # disabled in this variant; only the hit counts are reported.
        print(len(result_tfidf_list))
        print(len(result_w2v_list))
        sys.stdout.flush()
    except Exception:
        print("exception occurred")
        sys.stdout.flush()

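# Usage sketch (assumption: this function lives in a script, e.g. search.py,
# that a web layer shells out to; the script name is hypothetical):
#
#     python search.py "some query text"
#
# It echoes the query, then prints the tfidf and w2v hit counts.
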
def add_vector_for_old_news():
    # Backfill w2v/tfidf vectors for articles crawled before vectors existed.
    client = MongoClient()
    db = client['newsdb']
    articles = db.articles
    all_news_list = []
    for a in articles.find():
        b = a  # NOTE: aliases the document rather than copying it
        # if "title" in a:
        #     b["preprocessed_title"] = preprocess(a["title"])
        if "summary" in a:
            b["preprocessed_summary"] = preprocess(a["summary"])
        w2v_vector = get_word2vec(b).tolist()
        tfidf_vector = get_tfidt_vector(b).tolist()
        b["w2v"] = w2v_vector
        b["tfidf"] = tfidf_vector
        all_news_list.append(b)
        print("done")
    print("first step done")
    # Rewrite the collection with the enriched documents.
    articles.delete_many({})
    print("second step done")
    for dic in all_news_list:
        articles.insert_one(dic)
    print("third step done")

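# get_word2vec(dic) is not defined in these snippets. A minimal sketch of
# what it plausibly does, assuming a pre-trained gensim KeyedVectors file and
# simple token averaging (the model path, the averaging strategy, and using
# only preprocessed_text are assumptions, not confirmed by this code):

import numpy as np
from gensim.models import KeyedVectors

_kv = KeyedVectors.load("word2vec.kv")  # hypothetical model artifact


def get_word2vec_sketch(dic):
    # Average the vectors of all in-vocabulary tokens; return an all-zero
    # vector when nothing matches, which the callers above test for.
    tokens = dic.get("preprocessed_text", "").split(' ')
    vecs = [_kv[t] for t in tokens if t in _kv]
    if not vecs:
        return np.zeros(_kv.vector_size)
    return np.mean(vecs, axis=0)
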
def parse(self, response): dic = {"title":" ", "timestamp": "", "url": " ", "date": " ", "text": " ", "summary": " ", "tags": [], "article_section": " ", "code": " "} title = response.xpath('//h1[@class="title"]/text()').get() dic["title"] = title news_url = response.xpath('//a[@id="short-link"]/text()').get() dic["url"] = " ".join(news_url.split()) sections = [] dic["article_section"] = sections summary = response.xpath('//h3[@class="lead"]/text()').get() dic["summary"] = summary date = response.xpath('//li[@class="time"]/text()').get() date_list = date.split(' ') timelist = date_list[5].split(':') hour = convert_persian_to_english_numbers(timelist[0]) minute = convert_persian_to_english_numbers(timelist[1]) day = convert_persian_to_english_numbers(date_list[1]) month = month_dic[date_list[2]] year = convert_persian_to_english_numbers(date_list[3]) jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) dic["date"] = str(datetime_object) dic["timestamp"] = datetime_object.timestamp() code = "no code" dic["code"] = code tags = [] dic["tags"] = tags text_list = response.xpath('//div[@class="story"]/p/text()').getall() text = "" for t in text_list: text += t dic["text"] = text dic["preprocessed_title"] = preprocess(dic["title"]) dic["preprocessed_summary"] = preprocess(dic["summary"]) dic["preprocessed_text"] = preprocess(dic["text"]) dic["w2v"] = get_word2vec(dic).tolist() dic["tfidf"] = get_tfidt_vector(dic).tolist() es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) res = es.index(index='newsindex', doc_type='news', body=dic) client = MongoClient() db = client['newsdb_week'] articles = db.weekarticles result = articles.insert_one(dic)
def parse(self, response):
    # resfile = open('resfile_specific.html', 'w')
    # resfile.write(str(response.body.decode('utf-8')))
    # resfile.close()
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "http://titrnews.ir" + news_url
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//div[@class="subtitle"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    # Jalali date string; day/month/year tokens precede the HH:MM token.
    date = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news_nav news_id_c"]/text()').get()
    dic["code"] = code
    tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
    dic["tags"] = tags
    text_list = response.xpath('//div[@class="body"]/p/span/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response): dic = {"title":" ", "timestamp": " ", "url": " ", "date": " ", "text": " ", "summary": " ", "tags": [], "article_section": " ", "code": " "} title = response.xpath('//span[@class="title mb-2 d-block text-justify"]/text()').get() dic["title"] = title news_url = response.css('link[rel=canonical]::attr(href)').extract()[0] dic["url"] = news_url sections = response.xpath('//div[@class="category-name d-flex justify-content-center"]/span/a/text()').getall() final = [] for s in sections: processed_text = " ".join(s.split()) final.append(processed_text) dic["article_section"] = final summary = response.xpath('//p[@class="lead p-2 text-justify"]/text()').get() dic["summary"] = summary date = response.xpath('//div[@class="publish-time d-flex justify-content-center"]/span/text()').getall() if len(date) > 1: timelist = date[0].split(':') hour = convert_persian_to_english_numbers(timelist[0]) minute = convert_persian_to_english_numbers(timelist[1]) date_list = date[2].split('/') day = convert_persian_to_english_numbers(date_list[2]) month = convert_persian_to_english_numbers(date_list[1]) yearlist = date_list[0].split('،') year = convert_persian_to_english_numbers(yearlist[0]) jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) dic["date"] = str(datetime_object) else: date = response.xpath('//span[@class="publish-time text-center"]/text()').get() date_list = date.split(' ') timelist = date_list[2].split(':') hour = convert_persian_to_english_numbers(timelist[0]) minute = convert_persian_to_english_numbers(timelist[1]) d_list = date_list[0].split('/') day = convert_persian_to_english_numbers(d_list[2]) month = convert_persian_to_english_numbers(d_list[1]) year = convert_persian_to_english_numbers(d_list[0]) jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) dic["date"] = datetime_object # # dic["timestamp"] = datetime_object.timestamp() code = '' dic["code"] = code tags = response.xpath('//div[@class="tags mt-4 text-right d-flex flex-wrap"]/a/text()').getall() finaltags = [] for t in tags: processed_text = " ".join(t.split()) finaltags.append(processed_text) dic["tags"] = finaltags # text_list = response.xpath('//div[@class="nt-body text-right mt-4"]/p/text()').getall() text = "" for t in text_list: text += t dic["text"] = text dic["preprocessed_title"] = preprocess(dic["title"]) dic["preprocessed_summary"] = preprocess(dic["summary"]) dic["preprocessed_text"] = preprocess(dic["text"]) dic["w2v"] = get_word2vec(dic).tolist() dic["tfidf"] = get_tfidt_vector(dic).tolist() es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) res = es.index(index='newsindex', doc_type='news', body=dic) client = MongoClient() db = client['newsdb_week'] articles = db.weekarticles result = articles.insert_one(dic)
def parse(self, response): dic = {"title": " "} title = response.xpath( '//h1[@class="title h_title_news"]/a/text()').get() dic["title"] = title try: news_url = response.css( 'h1.h_title_news a::attr(href)').extract()[0] except (Exception): news_url = response.css('h1.Htags a::attr(href)').extract()[0] dic["url"] = "https://namehnews.com" + news_url # news_path sections = response.xpath( '//div[@class="news_path"]/a/text()').getall() dic["article_section"] = sections summary = response.xpath( '//div[@class="subtitle sub_news"]/text()').get() dic["summary"] = summary date = response.xpath( '//div[@class="news_nav news_pdate_c col-xs-36 col-sm-14 pull-left"]/text()' ).getall() date = " ".join(date[1].split()) date_list = date.split(' ') timelist = date_list[4].split(':') hour = convert_persian_to_english_numbers(timelist[0]) minute = convert_persian_to_english_numbers(timelist[1]) day = convert_persian_to_english_numbers(date_list[0]) month = month_dic[date_list[1]] year = convert_persian_to_english_numbers(date_list[2]) jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) dic["date"] = str(datetime_object) dic["timestamp"] = datetime_object.timestamp() code = response.xpath( '//div[@class="news_nav news_id_c col-xs-36 col-sm-11"]/text()' ).getall() code = " ".join(code[1].split()) dic["code"] = code tags = response.xpath('//div[@class="tags_title"]/a/text()').getall() dic["tags"] = tags text_list = response.xpath( '//div[@class="body body_news"]/div/text()').getall() text = "" for t in text_list: text += t dic["text"] = text dic["preprocessed_title"] = preprocess(dic["title"]) dic["preprocessed_summary"] = preprocess(dic["summary"]) dic["preprocessed_text"] = preprocess(dic["text"]) dic["w2v"] = get_word2vec(dic).tolist() dic["tfidf"] = get_tfidt_vector(dic).tolist() es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) res = es.index(index='newsindex', doc_type='news', body=dic) client = MongoClient() db = client['newsdb_week'] articles = db.weekarticles result = articles.insert_one(dic)
def search(text):
    try:
        articles = db.articles
        print("got search text")
        search_text = preprocess(text)
        print("done preprocess")
        dic = {"preprocessed_text": search_text}
        # The w2v and exact-match paths are disabled in this variant; only
        # tfidf similarity is scored.
        search_v_tfidf = get_tfidt_vector(dic)
        print("search vector tfidf")
        print(search_v_tfidf)
        if np.all(search_v_tfidf == 0):
            return 0
        now = datetime.datetime.now()
        result_w2v_list = []
        result_tfidf_list = []
        result_exact_list = []
        print("lists created")
        count = 0
        for a in articles.find():
            count += 1
            if not np.all(np.array(a["tfidf"]) == 0) and not np.all(search_v_tfidf == 0):
                print("hi")
                if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.2:
                    result_tfidf_list.append(a)
        print("num of documents checked : ")
        print(count)
        print(len(result_tfidf_list))
        print(len(result_w2v_list))
        print(len(result_exact_list))
        # Dump the hits to a plain-text report.
        res = "search text : \n\n" + text + "\n\n" + "tfidf : \n\n"
        for r in result_tfidf_list:
            res += r["url"] + "\n"
            if "title" in r:
                res += r["title"] + "\n"
            if "text" in r:
                res += r["text"] + "\n"
        res += "word2vec : \n\n"
        for r in result_w2v_list:
            res += r["url"] + "\n"
            if "title" in r:
                res += r["title"] + "\n"
            if "text" in r:
                res += r["text"] + "\n"
        res += "exact : \n\n"
        for r in result_exact_list:
            res += r["url"] + "\n"
            if "title" in r:
                res += r["title"] + "\n"
            if "text" in r:
                res += r["text"] + "\n"
        with open("searchresult.txt", "w") as file:
            file.write(res)
        return 1
    except Exception:
        return 0

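# preprocess(text) is used everywhere but defined elsewhere. A minimal
# sketch of a plausible pipeline, assuming hazm-based normalization and a
# stopword filter (the actual steps and stopword list are assumptions):

from hazm import Normalizer, word_tokenize

_normalizer = Normalizer()
_STOPWORDS = {'و', 'در', 'به', 'از', 'که'}  # placeholder list, not the real one


def preprocess_sketch(text):
    # Normalize the Persian text, tokenize, drop stopwords, and rejoin with
    # single spaces, since callers split the result on ' '.
    tokens = word_tokenize(_normalizer.normalize(text))
    return ' '.join(t for t in tokens if t not in _STOPWORDS)
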
def parse(self, response): dic = {"title": " "} title = response.xpath('//div[@class="news-head"]/h6/text()').get() title += response.xpath('//div[@class="news-head"]/h2/text()').get() dic["title"] = title news_url = response.xpath( '//*[@id="st-container"]/div/div/div/main/div[1]/div/div/div/ul/li[3]/a/@href' ).extract()[0] dic["url"] = "http://behdasht.gov.ir" + news_url # news_path sections = response.xpath( '//*[@id="page-content"]/div/article/div/div[2]/div/div/ul/li[1]/text()' ).getall() dic["article_section"] = sections[2] summary = response.xpath('//div[@class="news-lead"]/p/text()').get() dic["summary"] = summary date = response.xpath( '//*[@id="page-content"]/div/div[1]/div/div[1]/div/ul/li[1]/span/text()' ).get() date_list = date.split(' ') # print(date_list) timelist = date_list[5].split(':') hour = timelist[0] # print(hour) minute = timelist[1] # print(minute) # date_list = date_list[0].split("/") # print(date_list) day = date_list[2] month = date_list[1] year = date_list[0] jalili_date = jdatetime.date(1300 + int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) dic["date"] = str(datetime_object) dic["timestamp"] = datetime_object.timestamp() code = response.xpath( '//*[@id="page-content"]/div/article/div/div[2]/div/div/ul/li[2]/text()' ).getall() dic["code"] = code[2] tags = response.xpath( '//div[@class="es-news-tags"]/ul/li/a/text()').getall() dic["tags"] = tags text_list = response.xpath( '//div[@class="news-content"]/div/text()').getall() text = "" for t in text_list: text += t dic["text"] = text dic["preprocessed_title"] = preprocess(dic["title"]) dic["preprocessed_summary"] = preprocess(dic["summary"]) dic["preprocessed_text"] = preprocess(dic["text"]) dic["w2v"] = get_word2vec(dic).tolist() dic["tfidf"] = get_tfidt_vector(dic).tolist() es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) res = es.index(index='newsindex', doc_type='news', body=dic) client = MongoClient() db = client['newsdb_week'] articles = db.weekarticles result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "timestamp": " ",
        "title": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "preprocessed_text": " ",
        "w2v": [],
        "tfidf": [],
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    news_url = response.css('h1[class=title] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "https://www.asriran.com" + news_url
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections[1:]
    summary = response.xpath('//div[@class="subtitle"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    # Two possible date markups: the news_nav block ("time - date") or the
    # update_date block ("label: date - time").
    date_list = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').getall()
    if len(date_list) > 0:
        date = ""
        for d in date_list:
            date += d
        newdate = ''.join(date.split())
        parts = newdate.split('-')
        justdate = parts[1]
        justtime = parts[0]
    else:
        date = response.xpath('//div[@class="update_date"]/text()').getall()[0]
        newdatetmp = ''.join(date.split())
        tmp = newdatetmp.split(":")
        newdate = ':'.join(tmp[1:])
        parts = newdate.split('-')
        justdate = parts[0]
        justtime = parts[1]
    timelist = justtime.split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    # justdate is "<day><month-name><year>" with no separators; find where
    # the leading digits end to split the day from the month name.
    index = 0
    for char in justdate:
        if char not in num_dic:
            index = justdate.index(char)
            break
    day = convert_persian_to_english_numbers(justdate[0:index])
    monthandyear = justdate[index:]
    for char in monthandyear:
        if char in num_dic:
            index = monthandyear.index(char)
            break
    month = month_dic[monthandyear[0:index]]
    year = convert_persian_to_english_numbers(monthandyear[index:])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code_list = response.xpath('//div[@class="news_nav news_id_c"]/text()').getall()
    code = ""
    for c in code_list:
        code += c
    dic["code"] = code
    tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
    dic["tags"] = tags
    text_parts = response.xpath('//div[@class="body"]/p/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    if len(text) < 1:
        # Fall back to other body layouts when the plain-paragraph one is empty.
        maybe_div = response.xpath('//div[@class="body"]/div/text()').getall()
        for d in maybe_div:
            text += d
        maybe_p = response.xpath('//div[@class="body"]/p/text()').getall()
        for p in maybe_p:
            text += p
        maybe_s = response.xpath('//div[@class="body"]/p/span/text()').getall()
        for s in maybe_s:
            text += s
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    articles.insert_one(dic)

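# num_dic, used by the character scan above, only needs membership tests
# against Persian digit characters. A sketch of the assumed structure:

num_dic_sketch = {c: i for i, c in enumerate('۰۱۲۳۴۵۶۷۸۹')}
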
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="Htag"]/a/text()').get()
    dic["title"] = title
    news_url = response.css('h1[class=Htag] a::attr(href)').extract()[0]
    dic["url"] = "https://www.tabnak.ir" + news_url
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    final_sections = []
    for sec in sections:
        processed_text = " ".join(sec.split())
        final_sections.append(processed_text)
    dic["article_section"] = final_sections
    summary = response.xpath('//div[@class="subtitle"]/text()').getall()
    dic["summary"] = summary[1]
    # "sapn" is kept as-is; it appears to match the tag name the site
    # actually serves.
    date = response.xpath('//sapn[@class="fa_date"]/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news_id_c"]/text()').get()
    dic["code"] = " ".join(code.split())
    tags = response.xpath('//div[@class="tag_items"]/a/text()').getall()
    dic["tags"] = tags
    text_list = response.xpath('//div[@class="body"]/div/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    dic = {
        "timestamp": "",
        "title": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="Htags"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    news_url = response.css('h1[class=Htags] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "https://www.yjc.ir" + news_url
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//strong[@class="news_strong"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    date_list = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').getall()
    date = ""
    for d in date_list:
        date += d
    parts = date.split(' ')
    day = convert_persian_to_english_numbers(parts[0])
    month = month_dic[parts[1]]
    year = convert_persian_to_english_numbers(parts[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = parts[4]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code_list = response.xpath('//div[@class="news_nav news_id_c"]/text()').getall()
    code = ""
    for c in code_list:
        code += c
    dic["code"] = code
    tags = response.xpath('//div[@class="tag_items"]/a/text()').getall()
    dic["tags"] = tags
    text_parts = response.xpath('//div[@class="body"]/p/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    # NOTE: unlike most spiders here, this one writes to 'newsdb' rather
    # than 'newsdb_week'.
    db = client['newsdb']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    # resfile = open('resfile_specific.html', 'w')
    # resfile.write(str(response.body.decode('utf-8')))
    # resfile.close()
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@id="docDiv3TitrMain"]/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    # Derive the canonical URL from the print view's link.
    news_url = response.css('a[class=print-icon]::attr(href)').extract()[0]
    dic["url"] = "http://www.akhbarbank.com" + news_url.replace("/print", "")
    sections = response.xpath('//div[@class="dsinfo-p1-active"]/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//div[@id="docDivLead3"]/div/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    date = response.xpath('//div[@id="docDiv3Date"]/text()').get()
    date_list = date.split(' ')
    timelist = date_list[5].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[1])
    month = month_dic[date_list[2]]
    year = convert_persian_to_english_numbers(date_list[3])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@id="docDocID"]/text()').get()
    dic["code"] = code
    tags = []
    dic["tags"] = tags
    text_list = response.xpath('//div[@id="doctextarea"]/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    # resfile = open('resfile_specific.html', 'w')
    # resfile.write(str(response.body.decode('utf-8')))
    # resfile.close()
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    news_url = response.css('h1[class=title] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "http://sobhanehonline.com" + news_url
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    if len(sections) > 0:
        dic["article_section"] = sections[0]
    summary = response.xpath('//div[@class="subtitle"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    date = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').get()
    parts = date.split(' ')
    day = convert_persian_to_english_numbers(parts[0])
    month = month_dic[parts[1]]
    year = convert_persian_to_english_numbers(parts[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = parts[4]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news_nav news_id_c"]/text()').get()
    dic["code"] = code
    tags = response.xpath('//a[@class="tags_item"]/text()').getall()
    dic["tags"] = tags
    text_parts = response.xpath('//div[@align="justify"]/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    if len(text) < 1:
        # Fall back to other body layouts when the first selector is empty.
        maybe_text = response.xpath('//div[@class="body"]/text()').getall()
        for t in maybe_text:
            text += t
        maybe_p = response.xpath('//div[@class="body"]/p/text()').getall()
        for p in maybe_p:
            text += p
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/text()').get()
    dic["title"] = title
    news_url = response.css("link[rel='shortlink']::attr(href)").extract()[0]
    dic["url"] = "http://www.rajanews.com" + news_url
    sections = []
    dic["article_section"] = sections
    summary = response.xpath('//div[@class="lead"]/text()').get()
    dic["summary"] = summary
    # This site serves Gregorian "YYYY-MM-DD HH:MM:SS" dates directly, so no
    # Jalali conversion is needed.
    date = response.xpath('//div[@class="created"]/span/text()').get()
    date_list = date.split(' ')
    timelist = date_list[1].split(':')
    hour = timelist[0]
    minute = timelist[1]
    second = timelist[2]
    date_list = date_list[0].split('-')
    day = date_list[2]
    month = date_list[1]
    year = date_list[0]
    datetime_object = datetime.datetime(int(year), int(month), int(day),
                                        int(hour), int(minute), int(second))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news-id"]/text()').get()
    code = " ".join(code.split())
    dic["code"] = code
    tags = []
    dic["tags"] = tags
    text_list = response.xpath('//div[@class="body"]/div/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="first-title"]/text()').get()
    dic["title"] = title
    news_url = response.css('div[class=form-group] input::attr(value)').extract()[0]
    dic["url"] = news_url
    # The meta-news block packs section, date, and code into one list.
    meta_news = response.xpath('//div[@class="meta-news"]/ul/li/span/text()').getall()
    try:
        dic["article_section"] = meta_news[3]
    except Exception:
        dic["article_section"] = []
    summary = response.xpath('//p[@class="summary"]/text()').get()
    dic["summary"] = summary
    try:
        date = meta_news[1]
    except Exception:
        date = response.xpath('//time/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    try:
        dic["code"] = meta_news[5]
    except Exception:
        dic["code"] = ''
    tags = response.xpath('//footer[@class="tags"]/ul/li/a/text()').getall()
    dic["tags"] = tags
    text_list = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    item_body_SELECTOR = '.item-body'
    text = " "
    dic = {
        "timestamp": "",
        "url": " ",
        "title": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    news_url = response.xpath('//meta[@name="twitter:url"]').xpath('@content').get()
    dic["url"] = news_url
    article_section = response.xpath(
        '//meta[@property="article:section"]').xpath('@content').getall()
    dic["article_section"] = article_section
    item_summary_SELECTOR = '.item-summary p ::text'
    if response.css(item_summary_SELECTOR).extract():
        dic["summary"] = response.css(item_summary_SELECTOR).extract()[0]
        dic["preprocessed_summary"] = preprocess(dic["summary"])
    # Two possible date markups; fall back when the barcode block is absent.
    date = response.xpath('//div[@class="barcode"]/ul/li[@class="date"]/text()').get()
    if date is None:
        date = response.xpath('//div[@class="item-date"]/span/text()').get()
    parts = date.split(' ')
    day = convert_persian_to_english_numbers(parts[0])
    month = month_dic[parts[1]]
    year = convert_persian_to_english_numbers(parts[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = parts[4]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="barcode"]/ul/li[@class="id"]/span/text()').get()
    if code is None:
        code = response.xpath('//input[@id="newsId"]').xpath('@value').get()
    dic["code"] = code
    tags = response.xpath('//section[@class="box tags"]/div/ul/li/a/text()').getall()
    for brickset in response.css(item_body_SELECTOR):
        item_text_SELECTOR = '.item-text p ::text'
        paragraphs = brickset.css(item_text_SELECTOR).extract()
        # NOTE: the range stops one short, so the final paragraph is skipped.
        for i in range(0, len(paragraphs) - 1):
            text = text + '\n' + paragraphs[i]
    dic["text"] = text
    dic["tags"] = tags
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response): dic = {"title": " "} title = response.xpath( '//h1[@class="title title_news"]/span/text()').get() dic["title"] = title news_url = response.css('h1.title_news span::attr(href)').extract()[0] dic["url"] = "https://www.iribnews.ir" + news_url sections = response.xpath( '//div[@class="news_path"]/a/text()').getall() dic["article_section"] = sections summary = response.xpath('//p[@class="subtitle"]/text()').get() dic["summary"] = summary date = response.xpath( '//div[@class="news_nav news_pdate_c col-sm-16 col-xs-25"]/text()' ).getall() # dic["date"] = date # print("date:") # print(block["date"][1]) date = date[1] date_list = date.split(' ') # print(date_list) timelist = date_list[21].split(':') hour = convert_persian_to_english_numbers(timelist[0]) # print(hour) minute = convert_persian_to_english_numbers(timelist[1]) # print(minute) # day = convert_persian_to_english_numbers(date_list[17]) # print(day) month = month_dic[date_list[18]] # print(month) year = convert_persian_to_english_numbers(date_list[19]) # print(year) jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) # print(datetime_object) dic["date"] = str(datetime_object) dic["timestamp"] = datetime_object.timestamp() code = response.xpath( '//div[@class="news_nav news_id_c col-sm-10 col-xs-11"]/text()' ).getall() code = " ".join(code[1].split()) dic["code"] = code tags = response.xpath('//div[@class="tags_title"]/a/text()').getall() dic["tags"] = tags text_list = response.xpath( '//div[@class="body body_media_content_show"]/div/text()').getall( ) text = "" for t in text_list: text += t dic["text"] = text dic["preprocessed_title"] = preprocess(dic["title"]) dic["preprocessed_summary"] = preprocess(dic["summary"]) dic["preprocessed_text"] = preprocess(dic["text"]) dic["w2v"] = get_word2vec(dic).tolist() dic["tfidf"] = get_tfidt_vector(dic).tolist() es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) res = es.index(index='newsindex', doc_type='news', body=dic) client = MongoClient() db = client['newsdb_week'] articles = db.weekarticles result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//a[@itemprop="headline"]/text()').get()
    dic["title"] = title
    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "https://www.mashreghnews.ir" + news_url
    sections = response.xpath('//ol[@class="breadcrumb"]/li/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//p[@class="summary introtext"]/text()').get()
    dic["summary"] = summary
    date = response.xpath('//div[@class="col-xs-8 col-sm-6 item-date"]/span/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="col-xs-8 col-sm-3"]/span/text()').getall()
    dic["code"] = code
    tags = response.xpath(
        '//section[@class="box tags clean list-clean list-inline header-inline header-clean negative-margin bg-graylight"]/div/ul/li/a/text()'
    ).getall()
    dic["tags"] = tags
    text_list1 = response.xpath('//div[@class="item-text"]/p/span/text()').getall()
    text_list2 = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for t in text_list1:
        text += t
    for t in text_list2:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def search():
    client = MongoClient()
    src_db = client['newsdb']
    articles = src_db.articles
    dst_db = client['webdb']
    searchresults = dst_db.searchresults
    # text = searches.find().sort("_id", -1)[0]["text"]
    text = sys.argv[1]
    search_text = preprocess(text)
    search_text_tokens = search_text.split(' ')
    dic = {"preprocessed_text": search_text}
    search_v_w2v = get_word2vec(dic)
    search_v_tfidf = get_tfidt_vector(dic)
    now = datetime.datetime.now()
    result_w2v_list = []
    result_tfidf_list = []
    result_exact_list = []
    count = 0
    count_tokens = 0
    # Only scan articles crawled in the last 24 hours (86400 s).
    for a in articles.find({"timestamp": {"$gt": now.timestamp() - 86400.0}}):
        count += 1
        if not np.all(search_v_w2v == 0):
            if not np.all(np.array(a["w2v"]) == 0):
                if similarity(np.array(a["w2v"]), search_v_w2v) > 0.8:
                    result_w2v_list.append(a)
        if not np.all(np.array(a["tfidf"]) == 0):
            if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.7:
                result_tfidf_list.append(a)
        # for token in search_text_tokens:
        #     if token in a["text"]:
        #         count_tokens += 1
        # if count_tokens == len(search_text_tokens):
        #     result_exact_list.append(a)
    # Replace the previous search's results, then store one document per
    # scoring method (exact matching is disabled).
    searchresults.delete_many({})
    mydict = {"search_text": text, "result": result_w2v_list, "type": "w2v"}
    searchresults.insert_one(mydict)
    mydict = {"search_text": text, "result": result_tfidf_list, "type": "tfidf"}
    searchresults.insert_one(mydict)
    print("ok")
    sys.stdout.flush()

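# For reference, the documents this function leaves in webdb.searchresults
# (one per scoring method; values illustrative):
#
#     {"search_text": "<query>", "result": [<article docs>], "type": "w2v"}
#     {"search_text": "<query>", "result": [<article docs>], "type": "tfidf"}
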
from tfidf import get_tfidt_vector

client = MongoClient()
db = client['newsdb']
searches = db.searches
articles = db.articles
search_result = db.searchresults

text = searches.find().sort("_id", -1)[1000]["text"]
search_text = preprocess(text)
dic = {"preprocessed_text": search_text}
search_v_w2v = get_word2vec(dic)
search_v_tfidf = get_tfidt_vector(dic)


def similarity(vec, other_vec):
    # Cosine similarity between two vectors.
    dot = np.dot(vec, other_vec)
    norma = np.linalg.norm(vec)
    normb = np.linalg.norm(other_vec)
    cos = dot / (norma * normb)
    return cos


wanted_news = []
# for res in result:
#     dic = {"text": "", "similar_texts": []}
#     dic["text"] = res[0]

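# get_tfidt_vector (the project's spelling) comes from the local tfidf
# module imported above. A minimal sketch of what it might do, assuming a
# TfidfVectorizer fitted offline and persisted with joblib (the artifact
# name and fitting procedure are assumptions):

import joblib
import numpy as np

_vectorizer = joblib.load("tfidf_vectorizer.joblib")  # hypothetical artifact


def get_tfidt_vector_sketch(dic):
    # Transform the preprocessed text into a dense tf-idf row vector; the
    # callers compare against all-zero vectors to detect unseen vocabulary.
    row = _vectorizer.transform([dic["preprocessed_text"]])
    return np.asarray(row.todense()).ravel()
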
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "https://www.mehrnews.com" + news_url
    sections = response.xpath(
        '//div[@class="col-6 col-sm-4"]/ol[@class="breadcrumb"]/li/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//p[@class="summary introtext"]/text()').get()
    dic["summary"] = summary
    date = response.xpath('//div[@class="col-6 col-sm-4 item-date"]/span/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    # The year token carries a Persian comma; split it off before use.
    yearlist = date_list[2].split('،')
    year = convert_persian_to_english_numbers(yearlist[0])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="item-code"]/span/text()').get()
    dic["code"] = code
    tags = response.xpath('//section[@class="box tags"]/div/ul/li/a/text()').getall()
    dic["tags"] = tags
    text_list = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def search(text):
    try:
        client = MongoClient()
        src_db = client['newsdb_week']
        dst_db = client['webdb']
        articles = src_db.weekarticles
        searchresults = dst_db.searchresults
        # text = "بازگشایی مدارس تهران"  # sample query: "reopening of Tehran schools"
        search_text = preprocess(text)
        search_text_tokens = search_text.split(' ')
        dic = {"preprocessed_text": search_text}
        search_v_w2v = get_word2vec(dic)
        search_v_tfidf = get_tfidt_vector(dic)
        result_w2v_list = []
        result_tfidf_list = []
        result_exact_list = []
        now = datetime.datetime.now()
        count = 0
        # Only scan articles from the last 60 days (5184000 s);
        # use articles.find({}) instead to scan everything.
        for a in articles.find({"timestamp": {"$gt": now.timestamp() - 5184000.0}}):
            count = count + 1
            if not np.all(search_v_w2v == 0):
                if not np.all(np.array(a["w2v"]) == 0):
                    if similarity(np.array(a["w2v"]), search_v_w2v) > 0.8:
                        result_w2v_list.append(a)
            if not np.all(np.array(a["tfidf"]) == 0):
                if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.3:
                    result_tfidf_list.append(a)
        print(count)
        # Replace the previous search's results, then store one document per
        # scoring method (exact matching and the status flag are disabled).
        searchresults.delete_many({})
        mydict = {"search_text": text, "result": result_w2v_list, "type": "w2v"}
        searchresults.insert_one(mydict)
        mydict = {"search_text": text, "result": result_tfidf_list, "type": "tfidf"}
        searchresults.insert_one(mydict)
        print("OK")
        print(len(result_tfidf_list))
        print(len(result_w2v_list))
        # NOTE: string concatenation, not addition, e.g. 3 and 12 -> "312".
        s = str(len(result_tfidf_list)) + str(len(result_w2v_list))
        return s
    except Exception as e:
        print(e)
        print("exception occurred")
        return str(e)

def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "http://www.shafaf.ir" + news_url
    sections = []
    dic["article_section"] = sections
    summary = response.xpath('//p[@itemprop="description"]/text()').get()
    dic["summary"] = summary
    date = response.xpath(
        '//div[@class="news_nav news_pdate_c col-sm-16 col-xs-36"]/text()').getall()
    date = date[1]
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news_nav news_id_c"]/text()').get()
    # code = " ".join(code.split())
    dic["code"] = code
    tags = []
    dic["tags"] = tags
    text_list1 = response.xpath('//div[@class="body"]/p/text()').getall()
    if len(text_list1) == 0:
        text_list2 = response.xpath(
            '//div[@class="body"]/div[@class="item-text"]/p/text()').getall()
        text_list = text_list2
    else:
        text_list = text_list1
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    dic = {
        "timestamp": "",
        "title": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    news_url = response.css('h1[class=title] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "https://www.hamshahrionline.ir" + news_url
    # Drop the first breadcrumb entry.
    sections = response.xpath('//li[@class="breadcrumb-item"]/a/text()').getall()
    dic["article_section"] = sections[1:]
    summary = response.xpath('//p[@class="introtext"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    date = response.xpath(
        '//div[@class="col-6 col-sm-4 col-xl-4 item-date"]/span/text()').get()
    parts = date.split(' ')
    day = convert_persian_to_english_numbers(parts[1])
    month = month_dic[parts[2]]
    year = convert_persian_to_english_numbers(parts[3])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = parts[5]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="item-code"]/span/text()').get()
    dic["code"] = code
    tags = response.xpath('//section[@class="box tags"]/div/ul/li/a/text()').getall()
    dic["tags"] = tags
    text_parts = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb']
    articles = db.weekarticles
    result = articles.insert_one(dic)