# These snippets assume module-level imports of sys, datetime, numpy as np,
# jdatetime, pymongo.MongoClient, and elasticsearch.Elasticsearch, plus the
# project helpers preprocess, get_word2vec, get_tfidt_vector, similarity,
# month_dic, num_dic, and convert_persian_to_english_numbers.
def search():
    try:
        searches = db.searches
        articles = db.articles
        searchresults = db.searchresults
        searchstatus = db.searchstatus
        text = sys.argv[1]
        print(text)
        search_text = preprocess(text)
        search_text_tokens = search_text.split(' ')
        dic = {"preprocessed_text": search_text}
        search_v_w2v = get_word2vec(dic)
        search_v_tfidf = get_tfidt_vector(dic)
        result_w2v_list = []
        result_tfidf_list = []
        result_exact_list = []
        now = datetime.datetime.now()
        count = 0
        # Only scan articles crawled in the last 24 hours (86400 s);
        # switch to articles.find() to scan the whole collection.
        for a in articles.find({"timestamp": {"$gt": now.timestamp() - 86400.0}}):
            count += 1
            if not np.all(search_v_w2v == 0):
                if not np.all(np.array(a["w2v"]) == 0):
                    if similarity(np.array(a["w2v"]), search_v_w2v) > 0.3:
                        result_w2v_list.append(a)
            if not np.all(np.array(a["tfidf"]) == 0):
                if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.0:
                    result_tfidf_list.append(a)
        # Exact token matching and persisting results/status to MongoDB are
        # disabled in this variant; only the hit counts are reported.
        print(len(result_tfidf_list))
        print(len(result_w2v_list))
        sys.stdout.flush()
    except Exception:
        print("exception occurred")
        sys.stdout.flush()

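# Usage sketch (assumption: this function lives in a script, e.g. search.py,
# that a web layer shells out to; the script name is hypothetical):
#
#     python search.py "some query text"
#
# It echoes the query, then prints the tfidf and w2v hit counts.
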
def add_vector_for_old_news():
    # Backfill w2v/tfidf vectors for articles crawled before vectors existed.
    client = MongoClient()
    db = client['newsdb']
    articles = db.articles
    all_news_list = []
    for a in articles.find():
        b = a  # NOTE: aliases the document rather than copying it
        # if "title" in a:
        #     b["preprocessed_title"] = preprocess(a["title"])
        if "summary" in a:
            b["preprocessed_summary"] = preprocess(a["summary"])
        w2v_vector = get_word2vec(b).tolist()
        tfidf_vector = get_tfidt_vector(b).tolist()
        b["w2v"] = w2v_vector
        b["tfidf"] = tfidf_vector
        all_news_list.append(b)
        print("done")
    print("first step done")
    # Rewrite the collection with the enriched documents.
    articles.delete_many({})
    print("second step done")
    for dic in all_news_list:
        articles.insert_one(dic)
    print("third step done")

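# get_word2vec(dic) is not defined in these snippets. A minimal sketch of
# what it plausibly does, assuming a pre-trained gensim KeyedVectors file and
# simple token averaging (the model path, the averaging strategy, and using
# only preprocessed_text are assumptions, not confirmed by this code):

import numpy as np
from gensim.models import KeyedVectors

_kv = KeyedVectors.load("word2vec.kv")  # hypothetical model artifact


def get_word2vec_sketch(dic):
    # Average the vectors of all in-vocabulary tokens; return an all-zero
    # vector when nothing matches, which the callers above test for.
    tokens = dic.get("preprocessed_text", "").split(' ')
    vecs = [_kv[t] for t in tokens if t in _kv]
    if not vecs:
        return np.zeros(_kv.vector_size)
    return np.mean(vecs, axis=0)
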
def parse(self, response): dic = {"title":" ", "timestamp": "", "url": " ", "date": " ", "text": " ", "summary": " ", "tags": [], "article_section": " ", "code": " "} title = response.xpath('//h1[@class="title"]/text()').get() dic["title"] = title news_url = response.xpath('//a[@id="short-link"]/text()').get() dic["url"] = " ".join(news_url.split()) sections = [] dic["article_section"] = sections summary = response.xpath('//h3[@class="lead"]/text()').get() dic["summary"] = summary date = response.xpath('//li[@class="time"]/text()').get() date_list = date.split(' ') timelist = date_list[5].split(':') hour = convert_persian_to_english_numbers(timelist[0]) minute = convert_persian_to_english_numbers(timelist[1]) day = convert_persian_to_english_numbers(date_list[1]) month = month_dic[date_list[2]] year = convert_persian_to_english_numbers(date_list[3]) jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) dic["date"] = str(datetime_object) dic["timestamp"] = datetime_object.timestamp() code = "no code" dic["code"] = code tags = [] dic["tags"] = tags text_list = response.xpath('//div[@class="story"]/p/text()').getall() text = "" for t in text_list: text += t dic["text"] = text dic["preprocessed_title"] = preprocess(dic["title"]) dic["preprocessed_summary"] = preprocess(dic["summary"]) dic["preprocessed_text"] = preprocess(dic["text"]) dic["w2v"] = get_word2vec(dic).tolist() dic["tfidf"] = get_tfidt_vector(dic).tolist() es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) res = es.index(index='newsindex', doc_type='news', body=dic) client = MongoClient() db = client['newsdb_week'] articles = db.weekarticles result = articles.insert_one(dic)
def parse(self, response):
    # resfile = open('resfile_specific.html', 'w')
    # resfile.write(str(response.body.decode('utf-8')))
    # resfile.close()
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "http://titrnews.ir" + news_url
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//div[@class="subtitle"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    # Jalali date string; day/month/year tokens precede the HH:MM token.
    date = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news_nav news_id_c"]/text()').get()
    dic["code"] = code
    tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
    dic["tags"] = tags
    text_list = response.xpath('//div[@class="body"]/p/span/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response): dic = {"title":" ", "timestamp": " ", "url": " ", "date": " ", "text": " ", "summary": " ", "tags": [], "article_section": " ", "code": " "} title = response.xpath('//span[@class="title mb-2 d-block text-justify"]/text()').get() dic["title"] = title news_url = response.css('link[rel=canonical]::attr(href)').extract()[0] dic["url"] = news_url sections = response.xpath('//div[@class="category-name d-flex justify-content-center"]/span/a/text()').getall() final = [] for s in sections: processed_text = " ".join(s.split()) final.append(processed_text) dic["article_section"] = final summary = response.xpath('//p[@class="lead p-2 text-justify"]/text()').get() dic["summary"] = summary date = response.xpath('//div[@class="publish-time d-flex justify-content-center"]/span/text()').getall() if len(date) > 1: timelist = date[0].split(':') hour = convert_persian_to_english_numbers(timelist[0]) minute = convert_persian_to_english_numbers(timelist[1]) date_list = date[2].split('/') day = convert_persian_to_english_numbers(date_list[2]) month = convert_persian_to_english_numbers(date_list[1]) yearlist = date_list[0].split('،') year = convert_persian_to_english_numbers(yearlist[0]) jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) dic["date"] = str(datetime_object) else: date = response.xpath('//span[@class="publish-time text-center"]/text()').get() date_list = date.split(' ') timelist = date_list[2].split(':') hour = convert_persian_to_english_numbers(timelist[0]) minute = convert_persian_to_english_numbers(timelist[1]) d_list = date_list[0].split('/') day = convert_persian_to_english_numbers(d_list[2]) month = convert_persian_to_english_numbers(d_list[1]) year = convert_persian_to_english_numbers(d_list[0]) jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) dic["date"] = datetime_object # # dic["timestamp"] = datetime_object.timestamp() code = '' dic["code"] = code tags = response.xpath('//div[@class="tags mt-4 text-right d-flex flex-wrap"]/a/text()').getall() finaltags = [] for t in tags: processed_text = " ".join(t.split()) finaltags.append(processed_text) dic["tags"] = finaltags # text_list = response.xpath('//div[@class="nt-body text-right mt-4"]/p/text()').getall() text = "" for t in text_list: text += t dic["text"] = text dic["preprocessed_title"] = preprocess(dic["title"]) dic["preprocessed_summary"] = preprocess(dic["summary"]) dic["preprocessed_text"] = preprocess(dic["text"]) dic["w2v"] = get_word2vec(dic).tolist() dic["tfidf"] = get_tfidt_vector(dic).tolist() es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) res = es.index(index='newsindex', doc_type='news', body=dic) client = MongoClient() db = client['newsdb_week'] articles = db.weekarticles result = articles.insert_one(dic)
def parse(self, response): dic = {"title": " "} title = response.xpath( '//h1[@class="title h_title_news"]/a/text()').get() dic["title"] = title try: news_url = response.css( 'h1.h_title_news a::attr(href)').extract()[0] except (Exception): news_url = response.css('h1.Htags a::attr(href)').extract()[0] dic["url"] = "https://namehnews.com" + news_url # news_path sections = response.xpath( '//div[@class="news_path"]/a/text()').getall() dic["article_section"] = sections summary = response.xpath( '//div[@class="subtitle sub_news"]/text()').get() dic["summary"] = summary date = response.xpath( '//div[@class="news_nav news_pdate_c col-xs-36 col-sm-14 pull-left"]/text()' ).getall() date = " ".join(date[1].split()) date_list = date.split(' ') timelist = date_list[4].split(':') hour = convert_persian_to_english_numbers(timelist[0]) minute = convert_persian_to_english_numbers(timelist[1]) day = convert_persian_to_english_numbers(date_list[0]) month = month_dic[date_list[1]] year = convert_persian_to_english_numbers(date_list[2]) jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) dic["date"] = str(datetime_object) dic["timestamp"] = datetime_object.timestamp() code = response.xpath( '//div[@class="news_nav news_id_c col-xs-36 col-sm-11"]/text()' ).getall() code = " ".join(code[1].split()) dic["code"] = code tags = response.xpath('//div[@class="tags_title"]/a/text()').getall() dic["tags"] = tags text_list = response.xpath( '//div[@class="body body_news"]/div/text()').getall() text = "" for t in text_list: text += t dic["text"] = text dic["preprocessed_title"] = preprocess(dic["title"]) dic["preprocessed_summary"] = preprocess(dic["summary"]) dic["preprocessed_text"] = preprocess(dic["text"]) dic["w2v"] = get_word2vec(dic).tolist() dic["tfidf"] = get_tfidt_vector(dic).tolist() es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) res = es.index(index='newsindex', doc_type='news', body=dic) client = MongoClient() db = client['newsdb_week'] articles = db.weekarticles result = articles.insert_one(dic)
def search(text):
    try:
        articles = db.articles
        print("got search text")
        search_text = preprocess(text)
        print("done preprocess")
        dic = {"preprocessed_text": search_text}
        # The w2v and exact-match paths are disabled in this variant; only
        # tfidf similarity is scored.
        search_v_tfidf = get_tfidt_vector(dic)
        print("search vector tfidf")
        print(search_v_tfidf)
        if np.all(search_v_tfidf == 0):
            return 0
        now = datetime.datetime.now()
        result_w2v_list = []
        result_tfidf_list = []
        result_exact_list = []
        print("lists created")
        count = 0
        for a in articles.find():
            count += 1
            if not np.all(np.array(a["tfidf"]) == 0) and not np.all(search_v_tfidf == 0):
                print("hi")
                if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.2:
                    result_tfidf_list.append(a)
        print("num of documents checked : ")
        print(count)
        print(len(result_tfidf_list))
        print(len(result_w2v_list))
        print(len(result_exact_list))
        # Dump the hits to a plain-text report.
        res = "search text : \n\n" + text + "\n\n" + "tfidf : \n\n"
        for r in result_tfidf_list:
            res += r["url"] + "\n"
            if "title" in r:
                res += r["title"] + "\n"
            if "text" in r:
                res += r["text"] + "\n"
        res += "word2vec : \n\n"
        for r in result_w2v_list:
            res += r["url"] + "\n"
            if "title" in r:
                res += r["title"] + "\n"
            if "text" in r:
                res += r["text"] + "\n"
        res += "exact : \n\n"
        for r in result_exact_list:
            res += r["url"] + "\n"
            if "title" in r:
                res += r["title"] + "\n"
            if "text" in r:
                res += r["text"] + "\n"
        with open("searchresult.txt", "w") as file:
            file.write(res)
        return 1
    except Exception:
        return 0

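# preprocess(text) is used everywhere but defined elsewhere. A minimal
# sketch of a plausible pipeline, assuming hazm-based normalization and a
# stopword filter (the actual steps and stopword list are assumptions):

from hazm import Normalizer, word_tokenize

_normalizer = Normalizer()
_STOPWORDS = {'و', 'در', 'به', 'از', 'که'}  # placeholder list, not the real one


def preprocess_sketch(text):
    # Normalize the Persian text, tokenize, drop stopwords, and rejoin with
    # single spaces, since callers split the result on ' '.
    tokens = word_tokenize(_normalizer.normalize(text))
    return ' '.join(t for t in tokens if t not in _STOPWORDS)
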
def parse(self, response): dic = {"title": " "} title = response.xpath('//div[@class="news-head"]/h6/text()').get() title += response.xpath('//div[@class="news-head"]/h2/text()').get() dic["title"] = title news_url = response.xpath( '//*[@id="st-container"]/div/div/div/main/div[1]/div/div/div/ul/li[3]/a/@href' ).extract()[0] dic["url"] = "http://behdasht.gov.ir" + news_url # news_path sections = response.xpath( '//*[@id="page-content"]/div/article/div/div[2]/div/div/ul/li[1]/text()' ).getall() dic["article_section"] = sections[2] summary = response.xpath('//div[@class="news-lead"]/p/text()').get() dic["summary"] = summary date = response.xpath( '//*[@id="page-content"]/div/div[1]/div/div[1]/div/ul/li[1]/span/text()' ).get() date_list = date.split(' ') # print(date_list) timelist = date_list[5].split(':') hour = timelist[0] # print(hour) minute = timelist[1] # print(minute) # date_list = date_list[0].split("/") # print(date_list) day = date_list[2] month = date_list[1] year = date_list[0] jalili_date = jdatetime.date(1300 + int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) dic["date"] = str(datetime_object) dic["timestamp"] = datetime_object.timestamp() code = response.xpath( '//*[@id="page-content"]/div/article/div/div[2]/div/div/ul/li[2]/text()' ).getall() dic["code"] = code[2] tags = response.xpath( '//div[@class="es-news-tags"]/ul/li/a/text()').getall() dic["tags"] = tags text_list = response.xpath( '//div[@class="news-content"]/div/text()').getall() text = "" for t in text_list: text += t dic["text"] = text dic["preprocessed_title"] = preprocess(dic["title"]) dic["preprocessed_summary"] = preprocess(dic["summary"]) dic["preprocessed_text"] = preprocess(dic["text"]) dic["w2v"] = get_word2vec(dic).tolist() dic["tfidf"] = get_tfidt_vector(dic).tolist() es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) res = es.index(index='newsindex', doc_type='news', body=dic) client = MongoClient() db = client['newsdb_week'] articles = db.weekarticles result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "timestamp": " ",
        "title": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "preprocessed_text": " ",
        "w2v": [],
        "tfidf": [],
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    news_url = response.css('h1[class=title] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "https://www.asriran.com" + news_url
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections[1:]
    summary = response.xpath('//div[@class="subtitle"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    # Two possible date markups: the news_nav block ("time - date") or the
    # update_date block ("label: date - time").
    date_list = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').getall()
    if len(date_list) > 0:
        date = ""
        for d in date_list:
            date += d
        newdate = ''.join(date.split())
        parts = newdate.split('-')
        justdate = parts[1]
        justtime = parts[0]
    else:
        date = response.xpath('//div[@class="update_date"]/text()').getall()[0]
        newdatetmp = ''.join(date.split())
        tmp = newdatetmp.split(":")
        newdate = ':'.join(tmp[1:])
        parts = newdate.split('-')
        justdate = parts[0]
        justtime = parts[1]
    timelist = justtime.split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    # justdate is "<day><month-name><year>" with no separators; find where
    # the leading digits end to split the day from the month name.
    index = 0
    for char in justdate:
        if char not in num_dic:
            index = justdate.index(char)
            break
    day = convert_persian_to_english_numbers(justdate[0:index])
    monthandyear = justdate[index:]
    for char in monthandyear:
        if char in num_dic:
            index = monthandyear.index(char)
            break
    month = month_dic[monthandyear[0:index]]
    year = convert_persian_to_english_numbers(monthandyear[index:])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code_list = response.xpath('//div[@class="news_nav news_id_c"]/text()').getall()
    code = ""
    for c in code_list:
        code += c
    dic["code"] = code
    tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
    dic["tags"] = tags
    text_parts = response.xpath('//div[@class="body"]/p/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    if len(text) < 1:
        # Fall back to other body layouts when the plain-paragraph one is empty.
        maybe_div = response.xpath('//div[@class="body"]/div/text()').getall()
        for d in maybe_div:
            text += d
        maybe_p = response.xpath('//div[@class="body"]/p/text()').getall()
        for p in maybe_p:
            text += p
        maybe_s = response.xpath('//div[@class="body"]/p/span/text()').getall()
        for s in maybe_s:
            text += s
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    articles.insert_one(dic)

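# num_dic, used by the character scan above, only needs membership tests
# against Persian digit characters. A sketch of the assumed structure:

num_dic_sketch = {c: i for i, c in enumerate('۰۱۲۳۴۵۶۷۸۹')}
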
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="Htag"]/a/text()').get()
    dic["title"] = title
    news_url = response.css('h1[class=Htag] a::attr(href)').extract()[0]
    dic["url"] = "https://www.tabnak.ir" + news_url
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    final_sections = []
    for sec in sections:
        processed_text = " ".join(sec.split())
        final_sections.append(processed_text)
    dic["article_section"] = final_sections
    summary = response.xpath('//div[@class="subtitle"]/text()').getall()
    dic["summary"] = summary[1]
    # "sapn" is kept as-is; it appears to match the tag name the site
    # actually serves.
    date = response.xpath('//sapn[@class="fa_date"]/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news_id_c"]/text()').get()
    dic["code"] = " ".join(code.split())
    tags = response.xpath('//div[@class="tag_items"]/a/text()').getall()
    dic["tags"] = tags
    text_list = response.xpath('//div[@class="body"]/div/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    dic = {
        "timestamp": "",
        "title": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="Htags"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    news_url = response.css('h1[class=Htags] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "https://www.yjc.ir" + news_url
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//strong[@class="news_strong"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    date_list = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').getall()
    date = ""
    for d in date_list:
        date += d
    parts = date.split(' ')
    day = convert_persian_to_english_numbers(parts[0])
    month = month_dic[parts[1]]
    year = convert_persian_to_english_numbers(parts[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = parts[4]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code_list = response.xpath('//div[@class="news_nav news_id_c"]/text()').getall()
    code = ""
    for c in code_list:
        code += c
    dic["code"] = code
    tags = response.xpath('//div[@class="tag_items"]/a/text()').getall()
    dic["tags"] = tags
    text_parts = response.xpath('//div[@class="body"]/p/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    # NOTE: unlike most spiders here, this one writes to 'newsdb' rather
    # than 'newsdb_week'.
    db = client['newsdb']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    # resfile = open('resfile_specific.html', 'w')
    # resfile.write(str(response.body.decode('utf-8')))
    # resfile.close()
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@id="docDiv3TitrMain"]/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    # Derive the canonical URL from the print view's link.
    news_url = response.css('a[class=print-icon]::attr(href)').extract()[0]
    dic["url"] = "http://www.akhbarbank.com" + news_url.replace("/print", "")
    sections = response.xpath('//div[@class="dsinfo-p1-active"]/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//div[@id="docDivLead3"]/div/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    date = response.xpath('//div[@id="docDiv3Date"]/text()').get()
    date_list = date.split(' ')
    timelist = date_list[5].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[1])
    month = month_dic[date_list[2]]
    year = convert_persian_to_english_numbers(date_list[3])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@id="docDocID"]/text()').get()
    dic["code"] = code
    tags = []
    dic["tags"] = tags
    text_list = response.xpath('//div[@id="doctextarea"]/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    # resfile = open('resfile_specific.html', 'w')
    # resfile.write(str(response.body.decode('utf-8')))
    # resfile.close()
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    news_url = response.css('h1[class=title] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "http://sobhanehonline.com" + news_url
    sections = response.xpath('//div[@class="news_path"]/a/text()').getall()
    if len(sections) > 0:
        dic["article_section"] = sections[0]
    summary = response.xpath('//div[@class="subtitle"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    date = response.xpath('//div[@class="news_nav news_pdate_c"]/text()').get()
    parts = date.split(' ')
    day = convert_persian_to_english_numbers(parts[0])
    month = month_dic[parts[1]]
    year = convert_persian_to_english_numbers(parts[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = parts[4]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news_nav news_id_c"]/text()').get()
    dic["code"] = code
    tags = response.xpath('//a[@class="tags_item"]/text()').getall()
    dic["tags"] = tags
    text_parts = response.xpath('//div[@align="justify"]/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    if len(text) < 1:
        # Fall back to other body layouts when the first selector is empty.
        maybe_text = response.xpath('//div[@class="body"]/text()').getall()
        for t in maybe_text:
            text += t
        maybe_p = response.xpath('//div[@class="body"]/p/text()').getall()
        for p in maybe_p:
            text += p
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/text()').get()
    dic["title"] = title
    news_url = response.css("link[rel='shortlink']::attr(href)").extract()[0]
    dic["url"] = "http://www.rajanews.com" + news_url
    sections = []
    dic["article_section"] = sections
    summary = response.xpath('//div[@class="lead"]/text()').get()
    dic["summary"] = summary
    # This site serves Gregorian "YYYY-MM-DD HH:MM:SS" dates directly, so no
    # Jalali conversion is needed.
    date = response.xpath('//div[@class="created"]/span/text()').get()
    date_list = date.split(' ')
    timelist = date_list[1].split(':')
    hour = timelist[0]
    minute = timelist[1]
    second = timelist[2]
    date_list = date_list[0].split('-')
    day = date_list[2]
    month = date_list[1]
    year = date_list[0]
    datetime_object = datetime.datetime(int(year), int(month), int(day),
                                        int(hour), int(minute), int(second))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news-id"]/text()').get()
    code = " ".join(code.split())
    dic["code"] = code
    tags = []
    dic["tags"] = tags
    text_list = response.xpath('//div[@class="body"]/div/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="first-title"]/text()').get()
    dic["title"] = title
    news_url = response.css('div[class=form-group] input::attr(value)').extract()[0]
    dic["url"] = news_url
    # The meta-news block packs section, date, and code into one list.
    meta_news = response.xpath('//div[@class="meta-news"]/ul/li/span/text()').getall()
    try:
        dic["article_section"] = meta_news[3]
    except Exception:
        dic["article_section"] = []
    summary = response.xpath('//p[@class="summary"]/text()').get()
    dic["summary"] = summary
    try:
        date = meta_news[1]
    except Exception:
        date = response.xpath('//time/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    try:
        dic["code"] = meta_news[5]
    except Exception:
        dic["code"] = ''
    tags = response.xpath('//footer[@class="tags"]/ul/li/a/text()').getall()
    dic["tags"] = tags
    text_list = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    item_body_SELECTOR = '.item-body'
    text = " "
    dic = {
        "timestamp": "",
        "url": " ",
        "title": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    news_url = response.xpath('//meta[@name="twitter:url"]').xpath('@content').get()
    dic["url"] = news_url
    article_section = response.xpath(
        '//meta[@property="article:section"]').xpath('@content').getall()
    dic["article_section"] = article_section
    item_summary_SELECTOR = '.item-summary p ::text'
    if response.css(item_summary_SELECTOR).extract():
        dic["summary"] = response.css(item_summary_SELECTOR).extract()[0]
        dic["preprocessed_summary"] = preprocess(dic["summary"])
    # Two possible date markups; fall back when the barcode block is absent.
    date = response.xpath('//div[@class="barcode"]/ul/li[@class="date"]/text()').get()
    if date is None:
        date = response.xpath('//div[@class="item-date"]/span/text()').get()
    parts = date.split(' ')
    day = convert_persian_to_english_numbers(parts[0])
    month = month_dic[parts[1]]
    year = convert_persian_to_english_numbers(parts[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = parts[4]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="barcode"]/ul/li[@class="id"]/span/text()').get()
    if code is None:
        code = response.xpath('//input[@id="newsId"]').xpath('@value').get()
    dic["code"] = code
    tags = response.xpath('//section[@class="box tags"]/div/ul/li/a/text()').getall()
    for brickset in response.css(item_body_SELECTOR):
        item_text_SELECTOR = '.item-text p ::text'
        paragraphs = brickset.css(item_text_SELECTOR).extract()
        # NOTE: the range stops one short, so the final paragraph is skipped.
        for i in range(0, len(paragraphs) - 1):
            text = text + '\n' + paragraphs[i]
    dic["text"] = text
    dic["tags"] = tags
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response): dic = {"title": " "} title = response.xpath( '//h1[@class="title title_news"]/span/text()').get() dic["title"] = title news_url = response.css('h1.title_news span::attr(href)').extract()[0] dic["url"] = "https://www.iribnews.ir" + news_url sections = response.xpath( '//div[@class="news_path"]/a/text()').getall() dic["article_section"] = sections summary = response.xpath('//p[@class="subtitle"]/text()').get() dic["summary"] = summary date = response.xpath( '//div[@class="news_nav news_pdate_c col-sm-16 col-xs-25"]/text()' ).getall() # dic["date"] = date # print("date:") # print(block["date"][1]) date = date[1] date_list = date.split(' ') # print(date_list) timelist = date_list[21].split(':') hour = convert_persian_to_english_numbers(timelist[0]) # print(hour) minute = convert_persian_to_english_numbers(timelist[1]) # print(minute) # day = convert_persian_to_english_numbers(date_list[17]) # print(day) month = month_dic[date_list[18]] # print(month) year = convert_persian_to_english_numbers(date_list[19]) # print(year) jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian() datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour), int(minute)) # print(datetime_object) dic["date"] = str(datetime_object) dic["timestamp"] = datetime_object.timestamp() code = response.xpath( '//div[@class="news_nav news_id_c col-sm-10 col-xs-11"]/text()' ).getall() code = " ".join(code[1].split()) dic["code"] = code tags = response.xpath('//div[@class="tags_title"]/a/text()').getall() dic["tags"] = tags text_list = response.xpath( '//div[@class="body body_media_content_show"]/div/text()').getall( ) text = "" for t in text_list: text += t dic["text"] = text dic["preprocessed_title"] = preprocess(dic["title"]) dic["preprocessed_summary"] = preprocess(dic["summary"]) dic["preprocessed_text"] = preprocess(dic["text"]) dic["w2v"] = get_word2vec(dic).tolist() dic["tfidf"] = get_tfidt_vector(dic).tolist() es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) res = es.index(index='newsindex', doc_type='news', body=dic) client = MongoClient() db = client['newsdb_week'] articles = db.weekarticles result = articles.insert_one(dic)
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//a[@itemprop="headline"]/text()').get()
    dic["title"] = title
    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "https://www.mashreghnews.ir" + news_url
    sections = response.xpath('//ol[@class="breadcrumb"]/li/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//p[@class="summary introtext"]/text()').get()
    dic["summary"] = summary
    date = response.xpath('//div[@class="col-xs-8 col-sm-6 item-date"]/span/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="col-xs-8 col-sm-3"]/span/text()').getall()
    dic["code"] = code
    tags = response.xpath(
        '//section[@class="box tags clean list-clean list-inline header-inline header-clean negative-margin bg-graylight"]/div/ul/li/a/text()'
    ).getall()
    dic["tags"] = tags
    text_list1 = response.xpath('//div[@class="item-text"]/p/span/text()').getall()
    text_list2 = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for t in text_list1:
        text += t
    for t in text_list2:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def search():
    client = MongoClient()
    src_db = client['newsdb']
    articles = src_db.articles
    dst_db = client['webdb']
    searchresults = dst_db.searchresults
    # text = searches.find().sort("_id", -1)[0]["text"]
    text = sys.argv[1]
    search_text = preprocess(text)
    search_text_tokens = search_text.split(' ')
    dic = {"preprocessed_text": search_text}
    search_v_w2v = get_word2vec(dic)
    search_v_tfidf = get_tfidt_vector(dic)
    now = datetime.datetime.now()
    result_w2v_list = []
    result_tfidf_list = []
    result_exact_list = []
    count = 0
    count_tokens = 0
    # Only scan articles crawled in the last 24 hours (86400 s).
    for a in articles.find({"timestamp": {"$gt": now.timestamp() - 86400.0}}):
        count += 1
        if not np.all(search_v_w2v == 0):
            if not np.all(np.array(a["w2v"]) == 0):
                if similarity(np.array(a["w2v"]), search_v_w2v) > 0.8:
                    result_w2v_list.append(a)
        if not np.all(np.array(a["tfidf"]) == 0):
            if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.7:
                result_tfidf_list.append(a)
        # for token in search_text_tokens:
        #     if token in a["text"]:
        #         count_tokens += 1
        # if count_tokens == len(search_text_tokens):
        #     result_exact_list.append(a)
    # Replace the previous search's results, then store one document per
    # scoring method (exact matching is disabled).
    searchresults.delete_many({})
    mydict = {"search_text": text, "result": result_w2v_list, "type": "w2v"}
    searchresults.insert_one(mydict)
    mydict = {"search_text": text, "result": result_tfidf_list, "type": "tfidf"}
    searchresults.insert_one(mydict)
    print("ok")
    sys.stdout.flush()

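# For reference, the documents this function leaves in webdb.searchresults
# (one per scoring method; values illustrative):
#
#     {"search_text": "<query>", "result": [<article docs>], "type": "w2v"}
#     {"search_text": "<query>", "result": [<article docs>], "type": "tfidf"}
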
from tfidf import get_tfidt_vector

client = MongoClient()
db = client['newsdb']
searches = db.searches
articles = db.articles
search_result = db.searchresults

text = searches.find().sort("_id", -1)[1000]["text"]
search_text = preprocess(text)
dic = {"preprocessed_text": search_text}
search_v_w2v = get_word2vec(dic)
search_v_tfidf = get_tfidt_vector(dic)


def similarity(vec, other_vec):
    # Cosine similarity between two vectors.
    dot = np.dot(vec, other_vec)
    norma = np.linalg.norm(vec)
    normb = np.linalg.norm(other_vec)
    cos = dot / (norma * normb)
    return cos


wanted_news = []
# for res in result:
#     dic = {"text": "", "similar_texts": []}
#     dic["text"] = res[0]

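# get_tfidt_vector (the project's spelling) comes from the local tfidf
# module imported above. A minimal sketch of what it might do, assuming a
# TfidfVectorizer fitted offline and persisted with joblib (the artifact
# name and fitting procedure are assumptions):

import joblib
import numpy as np

_vectorizer = joblib.load("tfidf_vectorizer.joblib")  # hypothetical artifact


def get_tfidt_vector_sketch(dic):
    # Transform the preprocessed text into a dense tf-idf row vector; the
    # callers compare against all-zero vectors to detect unseen vocabulary.
    row = _vectorizer.transform([dic["preprocessed_text"]])
    return np.asarray(row.todense()).ravel()
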
def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "https://www.mehrnews.com" + news_url
    sections = response.xpath(
        '//div[@class="col-6 col-sm-4"]/ol[@class="breadcrumb"]/li/a/text()').getall()
    dic["article_section"] = sections
    summary = response.xpath('//p[@class="summary introtext"]/text()').get()
    dic["summary"] = summary
    date = response.xpath('//div[@class="col-6 col-sm-4 item-date"]/span/text()').get()
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    # The year token carries a Persian comma; split it off before use.
    yearlist = date_list[2].split('،')
    year = convert_persian_to_english_numbers(yearlist[0])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="item-code"]/span/text()').get()
    dic["code"] = code
    tags = response.xpath('//section[@class="box tags"]/div/ul/li/a/text()').getall()
    dic["tags"] = tags
    text_list = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def search(text):
    try:
        client = MongoClient()
        src_db = client['newsdb_week']
        dst_db = client['webdb']
        articles = src_db.weekarticles
        searchresults = dst_db.searchresults
        # text = "بازگشایی مدارس تهران"  # sample query: "reopening of Tehran schools"
        search_text = preprocess(text)
        search_text_tokens = search_text.split(' ')
        dic = {"preprocessed_text": search_text}
        search_v_w2v = get_word2vec(dic)
        search_v_tfidf = get_tfidt_vector(dic)
        result_w2v_list = []
        result_tfidf_list = []
        result_exact_list = []
        now = datetime.datetime.now()
        count = 0
        # Only scan articles from the last 60 days (5184000 s);
        # use articles.find({}) instead to scan everything.
        for a in articles.find({"timestamp": {"$gt": now.timestamp() - 5184000.0}}):
            count = count + 1
            if not np.all(search_v_w2v == 0):
                if not np.all(np.array(a["w2v"]) == 0):
                    if similarity(np.array(a["w2v"]), search_v_w2v) > 0.8:
                        result_w2v_list.append(a)
            if not np.all(np.array(a["tfidf"]) == 0):
                if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.3:
                    result_tfidf_list.append(a)
        print(count)
        # Replace the previous search's results, then store one document per
        # scoring method (exact matching and the status flag are disabled).
        searchresults.delete_many({})
        mydict = {"search_text": text, "result": result_w2v_list, "type": "w2v"}
        searchresults.insert_one(mydict)
        mydict = {"search_text": text, "result": result_tfidf_list, "type": "tfidf"}
        searchresults.insert_one(mydict)
        print("OK")
        print(len(result_tfidf_list))
        print(len(result_w2v_list))
        # NOTE: string concatenation, not addition, e.g. 3 and 12 -> "312".
        s = str(len(result_tfidf_list)) + str(len(result_w2v_list))
        return s
    except Exception as e:
        print(e)
        print("exception occurred")
        return str(e)

def parse(self, response):
    dic = {
        "title": " ",
        "timestamp": "",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
    dic["url"] = "http://www.shafaf.ir" + news_url
    sections = []
    dic["article_section"] = sections
    summary = response.xpath('//p[@itemprop="description"]/text()').get()
    dic["summary"] = summary
    date = response.xpath(
        '//div[@class="news_nav news_pdate_c col-sm-16 col-xs-36"]/text()').getall()
    date = date[1]
    date_list = date.split(' ')
    timelist = date_list[4].split(':')
    hour = convert_persian_to_english_numbers(timelist[0])
    minute = convert_persian_to_english_numbers(timelist[1])
    day = convert_persian_to_english_numbers(date_list[0])
    month = month_dic[date_list[1]]
    year = convert_persian_to_english_numbers(date_list[2])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="news_nav news_id_c"]/text()').get()
    # code = " ".join(code.split())
    dic["code"] = code
    tags = []
    dic["tags"] = tags
    text_list1 = response.xpath('//div[@class="body"]/p/text()').getall()
    if len(text_list1) == 0:
        text_list2 = response.xpath(
            '//div[@class="body"]/div[@class="item-text"]/p/text()').getall()
        text_list = text_list2
    else:
        text_list = text_list1
    text = ""
    for t in text_list:
        text += t
    dic["text"] = text
    dic["preprocessed_title"] = preprocess(dic["title"])
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb_week']
    articles = db.weekarticles
    result = articles.insert_one(dic)

def parse(self, response):
    dic = {
        "timestamp": "",
        "title": " ",
        "url": " ",
        "date": " ",
        "text": " ",
        "summary": " ",
        "tags": [],
        "article_section": " ",
        "code": " "
    }
    title = response.xpath('//h1[@class="title"]/a/text()').get()
    dic["title"] = title
    dic["preprocessed_title"] = preprocess(dic["title"])
    news_url = response.css('h1[class=title] a::attr(href)').extract()
    if len(news_url) > 0:
        news_url = news_url[0]
    dic["url"] = "https://www.hamshahrionline.ir" + news_url
    # Drop the first breadcrumb entry.
    sections = response.xpath('//li[@class="breadcrumb-item"]/a/text()').getall()
    dic["article_section"] = sections[1:]
    summary = response.xpath('//p[@class="introtext"]/text()').get()
    dic["summary"] = summary
    dic["preprocessed_summary"] = preprocess(dic["summary"])
    date = response.xpath(
        '//div[@class="col-6 col-sm-4 col-xl-4 item-date"]/span/text()').get()
    parts = date.split(' ')
    day = convert_persian_to_english_numbers(parts[1])
    month = month_dic[parts[2]]
    year = convert_persian_to_english_numbers(parts[3])
    jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
    time = parts[5]
    list_time = time.split(':')
    hour = convert_persian_to_english_numbers(list_time[0])
    minute = convert_persian_to_english_numbers(list_time[1])
    datetime_object = datetime.datetime(jalili_date.year, jalili_date.month,
                                        jalili_date.day, int(hour), int(minute))
    dic["date"] = str(datetime_object)
    dic["timestamp"] = datetime_object.timestamp()
    code = response.xpath('//div[@class="item-code"]/span/text()').get()
    dic["code"] = code
    tags = response.xpath('//section[@class="box tags"]/div/ul/li/a/text()').getall()
    dic["tags"] = tags
    text_parts = response.xpath('//div[@class="item-text"]/p/text()').getall()
    text = ""
    for text_part in text_parts:
        text += text_part
    dic["text"] = text
    dic["preprocessed_text"] = preprocess(dic["text"])
    dic["w2v"] = get_word2vec(dic).tolist()
    dic["tfidf"] = get_tfidt_vector(dic).tolist()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    res = es.index(index='newsindex', doc_type='news', body=dic)
    client = MongoClient()
    db = client['newsdb']
    articles = db.weekarticles
    result = articles.insert_one(dic)