Example #1
def search():
    try:
        searches = db.searches
        articles = db.articles
        searchresults = db.searchresults
        searchstatus = db.searchstatus
        text = sys.argv[1]
        print(text)
        search_text = preprocess(text)
        search_text_tokens = search_text.split(' ')
        dic = {"preprocessed_text": search_text}
        search_v_w2v = get_word2vec(dic)
        search_v_tfidf = get_tfidt_vector(dic)
        result_w2v_list = []
        result_tfidf_list = []
        result_exact_list = []
        now = datetime.datetime.now()
        count = 0
        for a in articles.find(
            {"timestamp": {
                "$gt": now.timestamp() - 86400.0
            }}):
            # count = 0
            # for a in articles.find():
            count += 1
            if not np.all(search_v_w2v == 0):
                if not np.all(np.array(a["w2v"]) == 0):
                    if similarity(np.array(a["w2v"]), search_v_w2v) > 0.3:
                        result_w2v_list.append(a)
                if not np.all(np.array(a["tfidf"]) == 0):
                    if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.0:
                        result_tfidf_list.append(a)
            # for token in search_text_tokens:
            #     if token in a["text"]:
            #         result_exact_list.append(a)
            #         break
        # print("num of documents checked : ")
        # print(count)

        # searchresults.delete_many({})
        #
        # mydict = {"search_text": text, "result": result_w2v_list, "type": "w2v"}
        # searchresults.insert_one(mydict)
        #
        # mydict = {"search_text": text, "result": result_tfidf_list, "type": "tfidf"}
        # searchresults.insert_one(mydict)
        #
        # mydict = {"search_text": text, "result": result_exact_list, "type": "exact"}
        # searchresults.insert_one(mydict)
        #
        # mydict = {"status": "done"}
        # searchstatus.insert_one(mydict)

        print(len(result_tfidf_list))
        print(len(result_w2v_list))
        sys.stdout.flush()

    except Exception as e:
        print("exception occurred:", e)
        sys.stdout.flush()
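
The similarity() helper used in the thresholds above (0.3 here, 0.8/0.7 in the second search() variant) is not part of these examples. A minimal sketch, assuming it is plain cosine similarity over the NumPy vectors stored in the documents:

import numpy as np

def similarity(v1, v2):
    # Cosine similarity of two 1-D vectors. The callers above already guard
    # against all-zero vectors, so the norms are assumed to be non-zero.
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
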
Example #2
def add_vector_for_old_news():
    client = MongoClient()
    db = client['newsdb']
    articles = db.articles
    # count = 0
    # b = {}
    # a = {"id":1}
    # b = a
    # print(b)
    # b["text"] = "salam"
    # print(b)
    all_news_list = []
    count_of_ah = 0
    for a in articles.find():
        b = a
        # if "title" in a :
        #     b["preprocessed_title"] = preprocess(a["title"])
        if "summary" in a:
            b["preprocessed_summary"] = preprocess(a["summary"])

        w2v_vector = get_word2vec(b).tolist()
        tfidf_vector = get_tfidt_vector(b).tolist()
        b["w2v"] = w2v_vector
        b["tfidf"] = tfidf_vector
        all_news_list.append(b)
        print("done")

    print("first step done")
    articles.delete_many({})
    print("second step done")
    for dic in all_news_list:
        articles.insert_one(dic)
    print("third step done")
Example #3
    def parse(self, response):

        dic = {"title":" ", "timestamp": "", "url": " ", "date": " ", "text": " ", "summary": " ", "tags": [], "article_section": " ", "code": " "}

        title = response.xpath('//h1[@class="title"]/text()').get()
        dic["title"] = title

        news_url = response.xpath('//a[@id="short-link"]/text()').get()
        dic["url"] = " ".join(news_url.split())

        sections = []
        dic["article_section"] = sections

        summary = response.xpath('//h3[@class="lead"]/text()').get()
        dic["summary"] = summary

        date = response.xpath('//li[@class="time"]/text()').get()
        date_list = date.split(' ')
        timelist = date_list[5].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])
        day = convert_persian_to_english_numbers(date_list[1])
        month = month_dic[date_list[2]]
        year = convert_persian_to_english_numbers(date_list[3])
        jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()
        datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour),
                                            int(minute))
        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = "no code"
        dic["code"] = code

        tags = []
        dic["tags"] = tags

        text_list = response.xpath('//div[@class="story"]/p/text()').getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
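
Every spider in these examples converts Persian digits to ASCII with convert_persian_to_english_numbers() before calling int(). The helper itself is not shown; a minimal sketch of what it is assumed to do:

def convert_persian_to_english_numbers(text):
    # Map Persian digits (and, defensively, Arabic-Indic digits) to ASCII digits.
    persian_digits = '۰۱۲۳۴۵۶۷۸۹'
    arabic_digits = '٠١٢٣٤٥٦٧٨٩'
    table = {ord(ch): str(i) for i, ch in enumerate(persian_digits)}
    table.update({ord(ch): str(i) for i, ch in enumerate(arabic_digits)})
    return text.translate(table)
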
Example #4
    def parse(self, response):
        HtmlResponse = response
        # resfile = open('resfile_specific.html', 'w')
        # resfile.write(str(HtmlResponse.body.decode('utf-8')))
        # resfile.close()

        dic = {
            "title": " ",
            "timestamp": "",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        title = response.xpath('//h1[@class="title"]/a/text()').get()
        dic["title"] = title
        dic["preprocessed_title"] = preprocess(dic["title"])

        news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
        dic["url"] = "http://titrnews.ir" + news_url

        sections = response.xpath(
            '//div[@class="news_path"]/a/text()').getall()
        dic["article_section"] = sections

        summary = response.xpath('//div[@class="subtitle"]/text()').get()
        dic["summary"] = summary
        dic["preprocessed_summary"] = preprocess(dic["summary"])

        date = response.xpath(
            '//div[@class="news_nav news_pdate_c"]/text()').get()
        date_list = date.split(' ')
        # print(date_list)
        timelist = date_list[4].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])
        # print("hour")
        # print(hour)
        # print("minute")
        # print(minute)
        #
        #
        day = convert_persian_to_english_numbers(date_list[0])
        # print("day")
        # print(day)
        #
        month = month_dic[date_list[1]]
        # print("month")
        # print(month)
        #
        #
        year = convert_persian_to_english_numbers(date_list[2])
        # print("year")
        # print(year)
        #
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        # print(jalili_date)
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))
        # print(datetime_object)

        #
        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath(
            '//div[@class="news_nav news_id_c"]/text()').get()
        dic["code"] = code

        tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
        dic["tags"] = tags

        text_list = response.xpath(
            '//div[@class="body"]/p/span/text()').getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Example #5
    def parse(self, response):

        dic = {"title":" ", "timestamp": " ", "url": " ", "date": " ", "text": " ", "summary": " ", "tags": [], "article_section": " ", "code": " "}

        title = response.xpath('//span[@class="title mb-2 d-block text-justify"]/text()').get()
        dic["title"] = title

        news_url = response.css('link[rel=canonical]::attr(href)').extract()[0]
        dic["url"] = news_url

        sections = response.xpath('//div[@class="category-name d-flex justify-content-center"]/span/a/text()').getall()
        final = []
        for s in sections:
            processed_text = " ".join(s.split())
            final.append(processed_text)
        dic["article_section"] = final

        summary = response.xpath('//p[@class="lead p-2 text-justify"]/text()').get()
        dic["summary"] = summary

        date = response.xpath('//div[@class="publish-time d-flex justify-content-center"]/span/text()').getall()
        if len(date) > 1:
            timelist = date[0].split(':')
            hour = convert_persian_to_english_numbers(timelist[0])
            minute = convert_persian_to_english_numbers(timelist[1])

            date_list = date[2].split('/')

            day = convert_persian_to_english_numbers(date_list[2])

            month = convert_persian_to_english_numbers(date_list[1])

            yearlist = date_list[0].split('،')

            year = convert_persian_to_english_numbers(yearlist[0])

            jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()

            datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour),
                                                int(minute))

            dic["date"] = str(datetime_object)
        else:
            date = response.xpath('//span[@class="publish-time text-center"]/text()').get()
            date_list = date.split(' ')

            timelist = date_list[2].split(':')
            hour = convert_persian_to_english_numbers(timelist[0])
            minute = convert_persian_to_english_numbers(timelist[1])

            d_list = date_list[0].split('/')

            day = convert_persian_to_english_numbers(d_list[2])

            month = convert_persian_to_english_numbers(d_list[1])

            year = convert_persian_to_english_numbers(d_list[0])

            jalili_date = jdatetime.date(int(year), int(month), int(day)).togregorian()

            datetime_object = datetime.datetime(jalili_date.year, jalili_date.month, jalili_date.day, int(hour),
                                                int(minute))

            dic["date"] = datetime_object

        #
        #
        dic["timestamp"] = datetime_object.timestamp()
        code = ''
        dic["code"] = code

        tags = response.xpath('//div[@class="tags mt-4 text-right d-flex flex-wrap"]/a/text()').getall()
        finaltags = []
        for t in tags:
            processed_text = " ".join(t.split())
            finaltags.append(processed_text)
        dic["tags"] = finaltags
        #
        text_list = response.xpath('//div[@class="nt-body text-right mt-4"]/p/text()').getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
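
Several of the parse() methods also index into a month_dic that maps the Jalali month name taken from the page to a month number for jdatetime. The dictionary is defined elsewhere; a plausible sketch, assuming the sites print the standard month names:

# Assumed mapping from Jalali month names to 1-based month numbers.
month_dic = {
    'فروردین': 1, 'اردیبهشت': 2, 'خرداد': 3,
    'تیر': 4, 'مرداد': 5, 'شهریور': 6,
    'مهر': 7, 'آبان': 8, 'آذر': 9,
    'دی': 10, 'بهمن': 11, 'اسفند': 12,
}
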
Example #6
    def parse(self, response):
        dic = {
            "timestamp": "",
            "title": " ",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        title = response.xpath('//h1[@class="title"]/a/text()').get()
        dic["title"] = title
        dic["preprocessed_title"] = preprocess(dic["title"])

        news_url = response.css('h1[class=title] a::attr(href)').extract()
        if len(news_url) > 0:
            news_url = news_url[0]
        dic["url"] = "https://www.hamshahrionline.ir" + news_url

        sections = response.xpath(
            '//li[@class="breadcrumb-item"]/a/text()').getall()
        dic["article_section"] = sections[1:]

        summary = response.xpath('//p[@class="introtext"]/text()').get()
        dic["summary"] = summary
        dic["preprocessed_summary"] = preprocess(dic["summary"])

        date = response.xpath(
            '//div[@class="col-6 col-sm-4 col-xl-4 item-date"]/span/text()'
        ).get()
        date_parts = date.split(' ')
        # print(date_parts)
        day = convert_persian_to_english_numbers(date_parts[1])
        month = month_dic[date_parts[2]]
        year = convert_persian_to_english_numbers(date_parts[3])
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        time_str = date_parts[5]
        # print(convert_persian_to_english_numbers(day))
        # print(month_dic[month])
        # print(convert_persian_to_english_numbers(year))

        list_time = time_str.split(':')
        hour = convert_persian_to_english_numbers(list_time[0])
        minute = convert_persian_to_english_numbers(list_time[1])
        # print(convert_persian_to_english_numbers(hour))
        # print(convert_persian_to_english_numbers(minute))
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))

        dic["date"] = str(datetime_object)

        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath('//div[@class="item-code"]/span/text()').get()
        dic["code"] = code

        tags = response.xpath(
            '//section[@class="box tags"]/div/ul/li/a/text()').getall()
        dic["tags"] = tags

        text_parts = response.xpath(
            '//div[@class="item-text"]/p/text()').getall()

        text = ""
        for text_part in text_parts:
            text += text_part

        dic["text"] = text

        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Example #7
    def parse(self, response):

        dic = {"title": " "}

        title = response.xpath(
            '//h1[@class="title h_title_news"]/a/text()').get()
        dic["title"] = title

        try:
            news_url = response.css(
                'h1.h_title_news a::attr(href)').extract()[0]
        except Exception:
            news_url = response.css('h1.Htags a::attr(href)').extract()[0]

        dic["url"] = "https://namehnews.com" + news_url

        # news_path
        sections = response.xpath(
            '//div[@class="news_path"]/a/text()').getall()
        dic["article_section"] = sections

        summary = response.xpath(
            '//div[@class="subtitle sub_news"]/text()').get()
        dic["summary"] = summary

        date = response.xpath(
            '//div[@class="news_nav news_pdate_c col-xs-36 col-sm-14 pull-left"]/text()'
        ).getall()
        date = " ".join(date[1].split())

        date_list = date.split(' ')
        timelist = date_list[4].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])

        day = convert_persian_to_english_numbers(date_list[0])
        month = month_dic[date_list[1]]
        year = convert_persian_to_english_numbers(date_list[2])
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))

        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath(
            '//div[@class="news_nav news_id_c col-xs-36 col-sm-11"]/text()'
        ).getall()
        code = " ".join(code[1].split())
        dic["code"] = code

        tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
        dic["tags"] = tags

        text_list = response.xpath(
            '//div[@class="body body_news"]/div/text()').getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Example #8
    def parse(self, response):

        dic = {
            "title": " ",
            "timestamp": " ",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        title = response.xpath('//h1[@class="title"]/a/text()').get()
        dic["title"] = title

        news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
        dic["url"] = "https://www.mehrnews.com" + news_url

        sections = response.xpath(
            '//div[@class="col-6 col-sm-4"]/ol[@class="breadcrumb"]/li/a/text()'
        ).getall()
        dic["article_section"] = sections

        summary = response.xpath(
            '//p[@class="summary introtext"]/text()').get()
        dic["summary"] = summary

        date = response.xpath(
            '//div[@class="col-6 col-sm-4 item-date"]/span/text()').get()
        date_list = date.split(' ')
        # print(date_list)
        timelist = date_list[4].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])
        # print("hour")
        # print(hour)
        # print("minute")
        # print(minute)
        #
        day = convert_persian_to_english_numbers(date_list[0])
        # print("day")
        # print(day)
        #
        month = month_dic[date_list[1]]
        # print("month")
        # print(month)
        #
        yearlist = date_list[2].split('،')
        # print(yearlist)
        year = convert_persian_to_english_numbers(yearlist[0])
        # print("year")
        # print(year)
        # # #
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        # print(jalili_date)
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))
        # print(datetime_object)

        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath('//div[@class="item-code"]/span/text()').get()
        dic["code"] = code

        tags = response.xpath(
            '//section[@class="box tags"]/div/ul/li/a/text()').getall()
        dic["tags"] = tags

        text_list = response.xpath(
            '//div[@class="item-text"]/p/text()').getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Example #9
def search():
    client = MongoClient()
    src_db = client['newsdb']
    articles = src_db.articles
    dst_db = client['webdb']
    searchresults = dst_db.searchresults
    # text = searches.find().sort("_id", -1)[0]["text"]
    text = sys.argv[1]
    # print("this is the text that python received")
    # print(text)

    search_text = preprocess(text)

    search_text_tokens = search_text.split(' ')
    # print("ok 1")
    dic = {"preprocessed_text": search_text}
    search_v_w2v = get_word2vec(dic)
    # print("ok 2")
    search_v_tfidf = get_tfidt_vector(dic)
    # print("ok 3")

    now = datetime.datetime.now()
    result_w2v_list = []
    result_tfidf_list = []
    result_exact_list = []

    # for a in articles.find({"timestamp": {"$gt": now.timestamp() - 86400.0}}):
    count = 0
    count_tokens = 0

    for a in articles.find({"timestamp": {"$gt": now.timestamp() - 86400.0}}):
        count += 1
        if not np.all(search_v_w2v == 0):
            if not np.all(np.array(a["w2v"]) == 0):
               if similarity(np.array(a["w2v"]), search_v_w2v) > 0.8:
                   result_w2v_list.append(a)
            if not np.all(np.array(a["tfidf"]) == 0):
               if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.7:
                   result_tfidf_list.append(a)
            # for token in search_text_tokens:
            #     if token in a["text"]:
            #         count_tokens += 1
            #         if count_tokens == len(search_text_tokens):
            #             result_exact_list.append(a)


    # print("ok 4")
    searchresults.delete_many({})
    # print("ok 5")
    mydict = {"search_text": text, "result": result_w2v_list, "type": "w2v"}
    searchresults.insert_one(mydict)

    # print("ok 6")

    mydict = {"search_text": text, "result": result_tfidf_list, "type": "tfidf"}
    searchresults.insert_one(mydict)

    # print("ok 7")

    # mydict = {"search_text": text, "result": result_exact_list, "type": "exact"}
    # searchresults.insert_one(mydict)

    # print("ok 8")

    print("ok")
    sys.stdout.flush()
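
Both search() variants and all of the spiders call preprocess() before vectorizing, but its implementation is not included in these examples. A minimal stand-in, assuming it only strips punctuation and collapses whitespace (a real Persian pipeline would typically also normalize characters and drop stop words):

import re

def preprocess(text):
    # Placeholder: remove punctuation and collapse runs of whitespace.
    text = re.sub(r'[^\w\s]', ' ', text, flags=re.UNICODE)
    return ' '.join(text.split())
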
Example #10
    def parse(self, response):

        dic = {"title": " "}

        title = response.xpath(
            '//h1[@class="title title_news"]/span/text()').get()
        dic["title"] = title

        news_url = response.css('h1.title_news span::attr(href)').extract()[0]
        dic["url"] = "https://www.iribnews.ir" + news_url

        sections = response.xpath(
            '//div[@class="news_path"]/a/text()').getall()
        dic["article_section"] = sections

        summary = response.xpath('//p[@class="subtitle"]/text()').get()
        dic["summary"] = summary

        date = response.xpath(
            '//div[@class="news_nav news_pdate_c col-sm-16 col-xs-25"]/text()'
        ).getall()
        # dic["date"] = date
        # print("date:")
        # print(block["date"][1])
        date = date[1]
        date_list = date.split(' ')
        # print(date_list)
        timelist = date_list[21].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        # print(hour)
        minute = convert_persian_to_english_numbers(timelist[1])
        # print(minute)
        #
        day = convert_persian_to_english_numbers(date_list[17])
        # print(day)
        month = month_dic[date_list[18]]
        # print(month)
        year = convert_persian_to_english_numbers(date_list[19])
        # print(year)
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))
        # print(datetime_object)
        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath(
            '//div[@class="news_nav news_id_c col-sm-10  col-xs-11"]/text()'
        ).getall()
        code = " ".join(code[1].split())
        dic["code"] = code

        tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
        dic["tags"] = tags

        text_list = response.xpath(
            '//div[@class="body body_media_content_show"]/div/text()'
        ).getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Example #11
    def parse(self, response):
        dic = {
            "timestamp": "",
            "title": " ",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        title = response.xpath('//h1[@class="Htags"]/a/text()').get()
        dic["title"] = title
        dic["preprocessed_title"] = preprocess(dic["title"])

        news_url = response.css('h1[class=Htags] a::attr(href)').extract()
        if len(news_url) > 0:
            news_url = news_url[0]
        dic["url"] = "https://www.yjc.ir" + news_url

        sections = response.xpath(
            '//div[@class="news_path"]/a/text()').getall()
        dic["article_section"] = sections

        summary = response.xpath('//strong[@class="news_strong"]/text()').get()
        dic["summary"] = summary
        dic["preprocessed_summary"] = preprocess(dic["summary"])

        date_list = response.xpath(
            '//div[@class="news_nav news_pdate_c"]/text()').getall()
        date = ""
        for d in date_list:
            date += d
        date_parts = date.split(' ')
        # print(date_parts)
        day = convert_persian_to_english_numbers(date_parts[0])
        month = month_dic[date_parts[1]]
        year = convert_persian_to_english_numbers(date_parts[2])
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        time_str = date_parts[4]
        # print(convert_persian_to_english_numbers(day))
        # print(month_dic[month])
        # print(convert_persian_to_english_numbers(year))

        list_time = time_str.split(':')
        hour = convert_persian_to_english_numbers(list_time[0])
        minute = convert_persian_to_english_numbers(list_time[1])
        # print(convert_persian_to_english_numbers(hour))
        # print(convert_persian_to_english_numbers(minute))
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))

        dic["date"] = str(datetime_object)

        dic["timestamp"] = datetime_object.timestamp()

        code_list = response.xpath(
            '//div[@class="news_nav news_id_c"]/text()').getall()
        code = ""
        for c in code_list:
            code += c
        dic["code"] = code

        tags = response.xpath('//div[@class="tag_items"]/a/text()').getall()
        dic["tags"] = tags

        text_parts = response.xpath('//div[@class="body"]/p/text()').getall()

        text = ""
        for text_part in text_parts:
            text += text_part

        dic["text"] = text

        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Example #12
    def parse(self, response):
        HtmlResponse = response
        # resfile = open('resfile_specific.html', 'w')
        # resfile.write(str(HtmlResponse.body.decode('utf-8')))
        # resfile.close()

        dic = {
            "title": " ",
            "timestamp": "",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        title = response.xpath('//h1[@id="docDiv3TitrMain"]/text()').get()
        dic["title"] = title
        dic["preprocessed_title"] = preprocess(dic["title"])

        news_url = response.css('a[class=print-icon]::attr(href)').extract()[0]
        dic["url"] = "http://www.akhbarbank.com" + news_url.replace(
            "/print", "")

        sections = response.xpath(
            '//div[@class="dsinfo-p1-active"]/a/text()').getall()
        dic["article_section"] = sections

        summary = response.xpath('//div[@id="docDivLead3"]/div/text()').get()
        dic["summary"] = summary
        dic["preprocessed_summary"] = preprocess(dic["summary"])

        date = response.xpath('//div[@id="docDiv3Date"]/text()').get()
        date_list = date.split(' ')
        # print(date_list)
        timelist = date_list[5].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])
        # print("hout")
        # print(hour)
        # print("minute")
        # print(minute)

        day = convert_persian_to_english_numbers(date_list[1])
        # print("day")
        # print(day)

        month = month_dic[date_list[2]]
        # print("month")
        # print(month)

        year = convert_persian_to_english_numbers(date_list[3])
        # print("year")
        # print(year)

        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        # print(jalili_date)
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))
        # print(datetime_object)

        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath('//div[@id="docDocID"]/text()').get()
        dic["code"] = code

        tags = []
        dic["tags"] = tags

        text_list = response.xpath('//div[@id="doctextarea"]/text()').getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Example #13
    def parse(self, response):
        HtmlResponse = response
        # resfile = open('resfile_specific.html', 'w')
        # resfile.write(str(HtmlResponse.body.decode('utf-8')))
        # resfile.close()

        dic = {
            "title": " ",
            "timestamp": "",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }
        title = response.xpath('//h1[@class="title"]/a/text()').get()
        dic["title"] = title
        dic["preprocessed_title"] = preprocess(dic["title"])
        news_url = response.css('h1[class=title] a::attr(href)').extract()
        if len(news_url) > 0:
            news_url = news_url[0]
        dic["url"] = "http://sobhanehonline.com" + news_url

        sections = response.xpath(
            '//div[@class="news_path"]/a/text()').getall()
        if len(sections) > 0:
            dic["article_section"] = sections[0]

        summary = response.xpath('//div[@class="subtitle"]/text()').get()
        dic["summary"] = summary
        dic["preprocessed_summary"] = preprocess(dic["summary"])

        date = response.xpath(
            '//div[@class="news_nav news_pdate_c"]/text()').get()
        date_parts = date.split(' ')
        # print(date_parts)
        day = convert_persian_to_english_numbers(date_parts[0])
        month = month_dic[date_parts[1]]
        year = convert_persian_to_english_numbers(date_parts[2])
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        time_str = date_parts[4]
        # print(convert_persian_to_english_numbers(day))
        # print(month_dic[month])
        # print(convert_persian_to_english_numbers(year))

        list_time = time_str.split(':')
        hour = convert_persian_to_english_numbers(list_time[0])
        minute = convert_persian_to_english_numbers(list_time[1])
        # print(convert_persian_to_english_numbers(hour))
        # print(convert_persian_to_english_numbers(minute))
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))

        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath(
            '//div[@class="news_nav news_id_c"]/text()').get()
        dic["code"] = code

        tags = response.xpath('//a[@class="tags_item"]/text()').getall()
        dic["tags"] = tags

        text_parts = response.xpath('//div[@align="justify"]/text()').getall()

        text = ""
        for text_part in text_parts:
            text += text_part

        if len(text) < 1:
            maybe_text = response.xpath('//div[@class="body"]/text()').getall()
            for t in maybe_text:
                text += t
            maybe_p = response.xpath('//div[@class="body"]/p/text()').getall()
            for p in maybe_p:
                text += p

        dic["text"] = text
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Example #14
def lstm(params):
    data, count, dictionary, embeddings, normalized_embeddings, weights, biases = word2vec.get_word2vec(
        2, False)
    words_size = embeddings.shape[0]
    embedding_size = embeddings.shape[1]
    print('Most common words (+UNK)', count[:5])
    print('embedding size:%s data:%s' %
          (embedding_size, [dictionary[word] for word in data[:100]]))

    # Create a small validation set.
    valid_size = 1000
    valid_text = data[:valid_size]
    train_text = data[valid_size:]
    train_size = len(train_text)

    p_num_unrollings = params['num_unrollings']
    p_batch_size = params['batch_size']

    class BatchGenerator(object):
        def __init__(self, text, batch_size, num_unrollings):
            assert batch_size >= 1
            assert num_unrollings >= 1
            self._text = text
            self._text_size = len(text)
            self._batch_size = batch_size
            self._num_unrollings = num_unrollings
            segment = self._text_size // batch_size
            self._cursor_boundary = [
                offset * segment for offset in range(batch_size)
            ]
            self._cursor = self._cursor_boundary[:]
            self._last_batch = self._next_batch()

        def _next_batch(self):
            """Generate a single batch from the current cursor position in the data."""
            batch = np.zeros(shape=(self._batch_size, embedding_size),
                             dtype=np.float32)
            for b in range(self._batch_size):
                batch[b] = normalized_embeddings[self._text[self._cursor[b]]]
                self._cursor[b] = (self._cursor[b] + 1)
            if self._cursor[self._batch_size - 1] == self._text_size:
                self._cursor = self._cursor_boundary[:]
            return batch

        def next(self):
            """Generate the next array of batches from the data. The array consists of
            the last batch of the previous array, followed by p_num_unrollings new ones.
            """
            batches = [self._last_batch]
            for _ in range(self._num_unrollings):
                batches.append(self._next_batch())
            self._last_batch = batches[-1]
            return batches

    def batches2string(batches):
        """Convert a sequence of batches back into their (most likely) string
        representation."""
        s = [''] * batches[0].shape[0]
        for b in batches:
            words = [
                dictionary[w]
                for w in np.argmax(np.matmul(b, normalized_embeddings.T), 1)
            ]
            s = [' '.join(x) for x in zip(s, words)]
        return s

    train_batches = BatchGenerator(train_text, p_batch_size, p_num_unrollings)
    valid_batches = BatchGenerator(valid_text, 1, 1)

    print(batches2string(train_batches.next()))
    print(batches2string(train_batches.next()))
    print(batches2string(train_batches.next()))
    print(batches2string(valid_batches.next()))
    print(batches2string(valid_batches.next()))
    print(batches2string(valid_batches.next()))

    def logprob(predictions, labels):
        """Log-probability of the true labels in a predicted batch."""
        predictions[predictions < 1e-10] = 1e-10
        return np.sum(
            -np.log([predictions[i, label]
                     for i, label in enumerate(labels)])) / labels.shape[0]

    graph = tf.Graph()
    with graph.as_default():
        p_num_nodes = params['num_nodes']
        p_max_k = params['max_k']

        def create_trainable_variables():
            '''
            Parameters:
                num_nodes*0:num_nodes*1 : Input gate
                num_nodes*1:num_nodes*2 : Forget gate
                num_nodes*2:num_nodes*3 : Output gate
                num_nodes*3:num_nodes*4 : New memory cell
            '''
            W = {
                'L1_W':
                tf.Variable(
                    tf.truncated_normal([embedding_size, p_num_nodes * 4],
                                        mean=0,
                                        stddev=0.1,
                                        name="L1_W")),
                'L1_U':
                tf.Variable(
                    tf.truncated_normal([p_num_nodes, p_num_nodes * 4],
                                        mean=0,
                                        stddev=0.1,
                                        name="L1_U")),
                'L1_b':
                tf.Variable(tf.zeros([1, p_num_nodes * 4]), name="L1_b"),
                'L2_W':
                tf.Variable(
                    tf.truncated_normal([p_num_nodes, p_num_nodes * 4],
                                        mean=0,
                                        stddev=0.1,
                                        name="L2_W")),
                'L2_U':
                tf.Variable(
                    tf.truncated_normal([p_num_nodes, p_num_nodes * 4],
                                        mean=0,
                                        stddev=0.1,
                                        name="L2_U")),
                'L2_b':
                tf.Variable(tf.zeros([1, p_num_nodes * 4]), name="L2_b"),
                'L3_W':
                tf.Variable(
                    tf.truncated_normal([p_num_nodes, p_num_nodes * 4],
                                        mean=0,
                                        stddev=0.1,
                                        name="L3_W")),
                'L3_U':
                tf.Variable(
                    tf.truncated_normal([p_num_nodes, p_num_nodes * 4],
                                        mean=0,
                                        stddev=0.1,
                                        name="L3_U")),
                'L3_b':
                tf.Variable(tf.zeros([1, p_num_nodes * 4]), name="L3_b"),
                'L4_W':
                tf.Variable(
                    tf.truncated_normal([p_num_nodes, embedding_size],
                                        mean=0,
                                        stddev=0.1,
                                        name="L4_W")),
                'L4_b':
                tf.Variable(tf.zeros([embedding_size]), name="L4_b"),
            }

            return W

        def create_variables(batch_size, num_unrollings):
            # Input data.
            train_data = list()
            for _ in range(num_unrollings + 1):
                train_data.append(
                    tf.placeholder(tf.float32,
                                   shape=[batch_size, embedding_size]))

            inputs = {
                'inputs': train_data[:num_unrollings],
                'labels':
                train_data[1:],  # labels are inputs shifted by one time step.
                'data': train_data,
                'dropout': tf.placeholder(tf.float32, name="dropout"),
            }

            # Variables saving state across unrollings.
            last_state = {
                'h1':
                tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                            trainable=False,
                            name="h1"),
                'c1':
                tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                            trainable=False,
                            name="c1"),
                'h2':
                tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                            trainable=False,
                            name="h2"),
                'c2':
                tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                            trainable=False,
                            name="c2"),
                'h3':
                tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                            trainable=False,
                            name="h3"),
                'c3':
                tf.Variable(tf.zeros([batch_size, p_num_nodes]),
                            trainable=False,
                            name="c3"),
            }

            return inputs, last_state

        # Definition of the cell computation.
        def lstm_cell(x, h, c, W, U, b):
            """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
            Note that in this formulation, we omit the various connections between the
            previous c (i.e. state) and the gates."""
            raw_data = tf.matmul(x, W) + tf.matmul(h, U) + b
            gates = tf.sigmoid(raw_data[:, :p_num_nodes * 3])
            input_gate = gates[:, :p_num_nodes]  # p_batch_size x p_num_nodes
            forget_gate = gates[:, p_num_nodes:p_num_nodes *
                                2]  # p_batch_size x p_num_nodes
            output_gate = gates[:, p_num_nodes * 2:p_num_nodes *
                                3]  # p_batch_size x p_num_nodes
            new_memory_cell = raw_data[:, p_num_nodes *
                                       3:]  # p_batch_size x p_num_nodes
            c_next = forget_gate * c + input_gate * tf.tanh(
                new_memory_cell)  # p_batch_size x p_num_nodes
            h_next = output_gate * tf.tanh(c_next)
            return h_next, c_next

        def create_model(W, inputs, last_state):
            ys = list()
            h1 = last_state['h1']
            c1 = last_state['c1']
            h2 = last_state['h2']
            c2 = last_state['c2']
            h3 = last_state['h3']
            c3 = last_state['c3']
            # construct 2 layer LSTM
            for x in inputs['inputs']:
                h1, c1 = lstm_cell(x, h1, c1, W['L1_W'], W['L1_U'], W['L1_b'])
                x2 = tf.nn.dropout(h1, inputs['dropout'], name="dropout")
                h2, c2 = lstm_cell(x2, h2, c2, W['L2_W'], W['L2_U'], W['L2_b'])
                x3 = tf.nn.dropout(h2, inputs['dropout'], name="dropout")
                h3, c3 = lstm_cell(x3, h3, c3, W['L3_W'], W['L3_U'], W['L3_b'])
                ys.append(h3)

            # State saving across unrollings.
            with tf.control_dependencies([
                    last_state['h1'].assign(h1), last_state['c1'].assign(c1),
                    last_state['h2'].assign(h2), last_state['c2'].assign(c2),
                    last_state['h3'].assign(h3), last_state['c3'].assign(c3)
            ]):
                # Classifier.
                Y_pred = tf.nn.xw_plus_b(tf.concat(0, ys), W['L4_W'],
                                         W['L4_b'])
                norm = tf.sqrt(
                    tf.reduce_sum(tf.square(Y_pred), 1, keep_dims=True))
                normalized_Y_pred = Y_pred / norm
                Y = tf.concat(0, inputs['labels'])
                l2_loss = params['beta_regularization_value'] * (
                    tf.nn.l2_loss(W['L1_W']) + tf.nn.l2_loss(W['L2_W']) +
                    tf.nn.l2_loss(W['L3_W']) + tf.nn.l2_loss(W['L4_W']))
                loss = tf.contrib.losses.cosine_distance(
                    normalized_Y_pred, Y, dim=1) + l2_loss

            model = {
                'loss': loss,
                'Y_pred': Y_pred,
            }
            return model

        # Convert vec to word
        norm_embeddings = tf.constant(normalized_embeddings.T)

        W = create_trainable_variables()
        inputs, last_state = create_variables(p_batch_size, p_num_unrollings)

        # Unrolled LSTM loop.
        model = create_model(W, inputs, last_state)

        # Optimizer.
        global_step = tf.Variable(0)
        learning_rate = tf.train.exponential_decay(
            params['start_learning_rate'],
            global_step,
            5000,
            0.1,
            staircase=True)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        gradients, v = zip(*optimizer.compute_gradients(model['loss']))
        gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
        optimizer = optimizer.apply_gradients(zip(gradients, v),
                                              global_step=global_step)

        grad_sum = [
            tf.sqrt(tf.reduce_mean(tf.square(gradient)))
            for gradient in gradients[:len(gradients) - 2]
        ]
        v_sum = [
            tf.sqrt(tf.reduce_mean(tf.square(variable)))
            for variable in v[:len(gradients) - 2]
        ]
        grad_v_sum = [grad / v for grad, v in zip(grad_sum, v_sum)]
        grad_sum_string = tf.Print(grad_sum, [grad_sum], message="grad_sum: ")
        v_sum_string = tf.Print(v_sum, [v_sum], message="v_sum: ")
        grad_v_sum_string = tf.Print(grad_v_sum, [grad_v_sum],
                                     message="grad_v_sum: ")

        # Sampling and validation eval: batch 1, no unrolling.
        sample_batch_size = 1
        sample_num_unrollings = 1
        sample_inputs, sample_last_state = create_variables(
            sample_batch_size, sample_num_unrollings)
        sample_model = create_model(W, sample_inputs, sample_last_state)
        reset_sample_state = tf.group(
            sample_last_state['h1'].assign(
                tf.zeros([sample_batch_size,
                          p_num_nodes])), sample_last_state['c1'].assign(
                              tf.zeros([sample_batch_size, p_num_nodes])),
            sample_last_state['h2'].assign(
                tf.zeros([sample_batch_size,
                          p_num_nodes])), sample_last_state['c2'].assign(
                              tf.zeros([sample_batch_size, p_num_nodes])),
            sample_last_state['h3'].assign(
                tf.zeros([sample_batch_size,
                          p_num_nodes])), sample_last_state['c3'].assign(
                              tf.zeros([sample_batch_size, p_num_nodes])))

        similarity = tf.matmul(sample_model['Y_pred'], norm_embeddings)
        sample_next = tf.nn.top_k(similarity, p_max_k)[1]

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()

    p_epochs = params['epochs']
    p_summary_frequency = params['summary_frequency']
    with tf.Session(graph=graph) as session:
        tf.initialize_all_variables().run()
        print('Initialized')
        if os.path.exists(params['savefile']) and params['resume']:
            # Restore variables from disk.
            saver.restore(session, params['savefile'])
            print("Model restored.")

        start_time = time.time()
        n_batch = len(data) // p_batch_size
        for epoch in range(int(math.ceil(p_epochs))):
            # p_epochs can be 0.001 to test overfit
            fraction = p_epochs - epoch
            if fraction < 1:
                n_batch = n_batch * fraction
            total_step = int(math.ceil(n_batch))
            mean_loss = 0
            print("Epoch %s start / total p_epochs %s, total steps %s" %
                  (epoch, p_epochs, total_step))
            for step in range(total_step):
                batches = train_batches.next()
                inputs_dict = dict()
                for i in range(p_num_unrollings + 1):
                    inputs_dict[inputs['data'][i]] = batches[i]
                inputs_dict[inputs['dropout']] = params['dropout']
                _, loss_e, learning_rate_e = session.run(
                    [optimizer, model['loss'], learning_rate],
                    feed_dict=inputs_dict)
                mean_loss += loss_e
                if step % p_summary_frequency == 0:
                    mean_loss = mean_loss / p_summary_frequency
                    # The mean loss is an estimate of the loss over the last few batches.
                    # PP = exp(CE) = exp(-log(prediction)) = 1/prediction. max PP = 1 / (1/50000) = 50000
                    print(
                        'Average loss at step(%d):%f learning rate:%.2f time:%s'
                        % (step, mean_loss, learning_rate_e,
                           timedelta(seconds=(time.time() - start_time))))
                    mean_loss = 0

                    def sample(candidate_indices):
                        # check https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py#L62
                        k = int(abs(random.normalvariate(
                            0, p_max_k / 2))) % p_max_k
                        index = candidate_indices[k]
                        # Skip UNK
                        if len(candidate_indices) > 1:
                            while index == 0:
                                k = int(
                                    abs(random.normalvariate(
                                        0, p_max_k / 2))) % p_max_k
                                index = candidate_indices[k]
                        return index

                    if step % (p_summary_frequency * 10) == 0:
                        # Generate some samples.
                        print('=' * 80)
                        for _ in range(5):
                            word = int(
                                random.uniform(0, 1) * words_size) % words_size
                            feed = np.array([embeddings[word]])
                            sentence = dictionary[word]
                            reset_sample_state.run()
                            for _ in range(79):
                                prediction = sample_next.eval({
                                    sample_inputs['inputs'][0]:
                                    feed,
                                    sample_inputs['dropout']:
                                    1,
                                })
                                index = sample(prediction[0, :])
                                feed = np.array([embeddings[index]])
                                sentence += ' ' + dictionary[index]
                            print(sentence)
                        print('=' * 80)

                        # Save the variables to disk.
                        save_path = saver.save(session, params['savefile'])

                        # Measure validation set perplexity.
                        valid_mean_loss = 0
                        reset_sample_state.run()
                        for _ in range(valid_size):
                            validation_batches = valid_batches.next()
                            sample_feeds = {
                                sample_inputs['inputs'][0]:
                                validation_batches[0],
                                sample_inputs['labels'][0]:
                                validation_batches[1],
                                sample_inputs['dropout']: 1,
                            }
                            valid_loss = session.run([sample_model['loss']],
                                                     feed_dict=sample_feeds)
                            valid_mean_loss += valid_loss[0]
                        print('Validation set loss: %.2f. saved:%s' %
                              (valid_mean_loss / valid_size, save_path))
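
The logprob() helper defined above is never called in this excerpt, although the "PP = exp(CE)" comment suggests it was meant for perplexity reporting. A hedged usage sketch, assuming predictions is a (batch, vocabulary) array of probabilities and labels holds the true indices:

import numpy as np

# Hypothetical use of logprob(): mean cross-entropy of a batch,
# and the corresponding perplexity PP = exp(CE).
predictions = np.array([[0.7, 0.2, 0.1],
                        [0.1, 0.8, 0.1]])
labels = np.array([0, 1])
cross_entropy = logprob(predictions, labels)
print('CE: %.3f  PP: %.3f' % (cross_entropy, np.exp(cross_entropy)))
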
Example #15
    def parse(self, response):

        dic = {
            "title": " ",
            "timestamp": " ",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        title = response.xpath('//h1[@class="title"]/text()').get()
        dic["title"] = title

        news_url = response.css(
            "link[rel='shortlink']::attr(href)").extract()[0]
        dic["url"] = "http://www.rajanews.com" + news_url

        sections = []
        dic["article_section"] = sections

        summary = response.xpath('//div[@class="lead"]/text()').get()
        dic["summary"] = summary

        date = response.xpath('//div[@class="created"]/span/text()').get()
        date_list = date.split(' ')
        timelist = date_list[1].split(':')
        # print(timelist)
        hour = timelist[0]
        minute = timelist[1]
        second = timelist[2]
        date_list = date_list[0].split('-')
        # print(date_list)
        day = date_list[2]
        month = date_list[1]
        year = date_list[0]
        datetime_object = datetime.datetime(int(year), int(month), int(day),
                                            int(hour), int(minute),
                                            int(second))
        # print(datetime_object)
        dic["date"] = datetime_object
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath('//div[@class="news-id"]/text()').get()
        code = " ".join(code.split())
        code_list = code.split(' ')
        dic["code"] = code

        tags = []
        dic["tags"] = tags

        text_list = response.xpath('//div[@class="body"]/div/text()').getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
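
# Editor's note: the spider above rebuilds its datetime by splitting the
# "YYYY-MM-DD HH:MM:SS" string field by field. Assuming the string really has
# that shape, strptime expresses the same conversion in one call; a minimal
# sketch with an illustrative value:
import datetime

datetime_object = datetime.datetime.strptime("2020-01-05 13:45:30",
                                             "%Y-%m-%d %H:%M:%S")
timestamp = datetime_object.timestamp()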
Ejemplo n.º 16
0
    def parse(self, response):

        dic = {
            "title": " ",
            "timestamp": " ",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        title = response.xpath('//h1[@class="first-title"]/text()').get()
        dic["title"] = title

        news_url = response.css(
            'div[class=form-group] input::attr(value)').extract()[0]
        dic["url"] = news_url

        meta_news = response.xpath(
            '//div[@class="meta-news"]/ul/li/span/text()').getall()

        try:
            dic["article_section"] = meta_news[3]
        except Exception:
            dic["article_section"] = []

        summary = response.xpath('//p[@class="summary"]/text()').get()
        dic["summary"] = summary

        try:
            date = meta_news[1]
        except Exception:
            date = response.xpath('//time/text()').get()

        date_list = date.split(' ')
        # print(date_list)
        timelist = date_list[4].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])
        # print("hour")
        # print(hour)
        # print("minute")
        # print(minute)

        day = convert_persian_to_english_numbers(date_list[0])
        # print("day")
        # print(day)
        #
        month = month_dic[date_list[1]]
        # print("month")
        # print(month)
        #

        year = convert_persian_to_english_numbers(date_list[2])
        # print("year")
        # print(year)

        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        # print(jalili_date)
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))
        # print(datetime_object)

        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        try:
            dic["code"] = meta_news[5]
        except Exception:
            dic["code"] = ''

        tags = response.xpath(
            '//footer[@class="tags"]/ul/li/a/text()').getall()
        dic["tags"] = tags

        text_list = response.xpath(
            '//div[@class="item-text"]/p/text()').getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
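
# Editor's note: the spiders in this dump repeat the same Jalali-to-Gregorian
# conversion (Persian digits -> int, month name -> number via month_dic,
# jdatetime.date(...).togregorian(), then datetime.datetime(...)). A minimal
# sketch of how that could be factored into one helper, assuming jdatetime is
# installed and that month_dic and convert_persian_to_english_numbers behave
# as in the surrounding code; the helper name is illustrative only.
import datetime

import jdatetime


def jalali_parts_to_datetime(day, month_name, year, hour, minute):
    """Build a Gregorian datetime from Persian-digit date parts and a month name."""
    gregorian = jdatetime.date(
        int(convert_persian_to_english_numbers(year)),
        int(month_dic[month_name]),
        int(convert_persian_to_english_numbers(day)),
    ).togregorian()
    return datetime.datetime(
        gregorian.year, gregorian.month, gregorian.day,
        int(convert_persian_to_english_numbers(hour)),
        int(convert_persian_to_english_numbers(minute)))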
Ejemplo n.º 17
0
    def parse(self, response):

        item_body_SELECTOR = '.item-body'
        text = " "
        dic = {
            "timestamp": "",
            "url": " ",
            "title": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        news_url = response.xpath('//meta[@name="twitter:url"]').xpath(
            '@content').get()
        dic["url"] = news_url

        article_section = response.xpath(
            '//meta[@property="article:section"]').xpath('@content').getall()
        dic["article_section"] = article_section

        item_summary_SELECTOR = '.item-summary p ::text'
        if (response.css(item_summary_SELECTOR).extract()):
            dic["summary"] = response.css(item_summary_SELECTOR).extract()[0]
            dic["preprocessed_summary"] = preprocess(dic["summary"])

        date = response.xpath(
            '//div[@class="barcode"]/ul/li[@class="date"]/text()').get()
        if date is None:
            date = response.xpath(
                '//div[@class="item-date"]/span/text()').get()
        date_list = date.split(' ')
        day = convert_persian_to_english_numbers(date_list[0])
        month = month_dic[date_list[1]]
        year = convert_persian_to_english_numbers(date_list[2])
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        time = date_list[4]
        # print(convert_persian_to_english_numbers(day))
        # print(month_dic[month])
        # print(convert_persian_to_english_numbers(year))

        list_time = time.split(':')
        hour = convert_persian_to_english_numbers(list_time[0])
        minute = convert_persian_to_english_numbers(list_time[1])
        # print(convert_persian_to_english_numbers(hour))
        # print(convert_persian_to_english_numbers(minute))
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))

        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath(
            '//div[@class="barcode"]/ul/li[@class="id"]/span/text()').get()
        if code == None:
            code = response.xpath('//input[@id="newsId"]').xpath(
                '@value').get()
        dic["code"] = code

        tags = response.xpath(
            '//section[@class="box tags"]/div/ul/li/a/text()').getall()

        for item_body in response.css(item_body_SELECTOR):
            item_text_SELECTOR = '.item-text p ::text'
            paragraphs = item_body.css(item_text_SELECTOR).extract()
            # Note: this loop stops at len(paragraphs) - 1, so the last
            # extracted paragraph is not appended (kept as in the original).
            for i in range(0, len(paragraphs) - 1):
                text = text + '\n' + paragraphs[i]

        dic["text"] = text

        dic["tags"] = tags

        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Ejemplo n.º 18
0
    def parse(self, response):

        dic = {
            "title": " ",
            "timestamp": "",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        title = response.xpath('//h1[@class="Htag"]/a/text()').get()
        dic["title"] = title

        news_url = response.css('h1[class=Htag] a::attr(href)').extract()[0]
        dic["url"] = "https://www.tabnak.ir" + news_url

        sections = response.xpath(
            '//div[@class="news_path"]/a/text()').getall()
        final_sections = []
        for sec in sections:
            processed_text = " ".join(sec.split())
            final_sections.append(processed_text)

        dic["article_section"] = final_sections

        summary = response.xpath('//div[@class="subtitle"]/text()').getall()
        dic["summary"] = summary[1]

        date = response.xpath('//sapn[@class="fa_date"]/text()').get()
        date_list = date.split(' ')
        # print(date_list)
        timelist = date_list[4].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])
        day = convert_persian_to_english_numbers(date_list[0])
        month = month_dic[date_list[1]]
        year = convert_persian_to_english_numbers(date_list[2])
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))

        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath('//div[@class="news_id_c"]/text()').get()
        dic["code"] = " ".join(code.split())

        tags = response.xpath('//div[@class="tag_items"]/a/text()').getall()
        dic["tags"] = tags

        text_list = response.xpath('//div[@class="body"]/div/text()').getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Ejemplo n.º 19
0
    def parse(self, response):

        dic = {
            "timestamp": " ",
            "title": " ",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "preprocessed_text": " ",
            "w2v": [],
            "tfidf": [],
            "code": " "
        }

        title = response.xpath('//h1[@class="title"]/a/text()').get()
        dic["title"] = title
        dic["preprocessed_title"] = preprocess(dic["title"])

        news_url = response.css('h1[class=title] a::attr(href)').extract()
        if len(news_url) > 0:
            news_url = news_url[0]
        dic["url"] = "https://www.asriran.com" + news_url

        sections = response.xpath(
            '//div[@class="news_path"]/a/text()').getall()
        dic["article_section"] = sections[1:]

        summary = response.xpath('//div[@class="subtitle"]/text()').get()
        dic["summary"] = summary
        dic["preprocessed_summary"] = preprocess(dic["summary"])

        date_list = response.xpath(
            '//div[@class="news_nav news_pdate_c"]/text()').getall()

        if len(date_list) > 0:
            date = ""
            for d in date_list:
                date += d
            newdate = ''.join(date.split())
            parts = newdate.split('-')
            justdate = parts[1]
            justtime = parts[0]
        else:
            date = response.xpath(
                '//div[@class="update_date"]/text()').getall()[0]
            newdatetmp = ''.join(date.split())
            tmp = newdatetmp.split(":")
            newdate = ':'.join(tmp[1:])
            parts = newdate.split('-')
            justdate = parts[0]
            justtime = parts[1]

        timelist = justtime.split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])
        # print(hour)
        # print(minute)
        # Split the squeezed date string (digits, month name, digits) into its
        # day, month-name, and year parts by scanning for the first non-digit
        # character and then for the next digit character.
        index = 0
        for char in justdate:
            if char not in num_dic:
                index = justdate.index(char)
                break
        day = convert_persian_to_english_numbers(justdate[0:index])
        monthandyear = justdate[index:]

        for char in monthandyear:
            if char in num_dic:
                index = monthandyear.index(char)
                break

        month = month_dic[monthandyear[0:index]]
        year = convert_persian_to_english_numbers(monthandyear[index:])
        # print(month)
        # print(year)
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        # print(jalili_date)
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))
        # print(datetime_object)

        dic["date"] = str(datetime_object)

        dic["timestamp"] = datetime_object.timestamp()

        code_list = response.xpath(
            '//div[@class="news_nav news_id_c"]/text()').getall()
        code = ""
        for c in code_list:
            code += c
        dic["code"] = code

        tags = response.xpath('//div[@class="tags_title"]/a/text()').getall()
        dic["tags"] = tags

        text_parts = response.xpath('//div[@class="body"]/p/text()').getall()

        text = ""
        for text_part in text_parts:
            text += text_part

        if not text:
            maybe_div = response.xpath(
                '//div[@class="body"]/div/text()').getall()
            for d in maybe_div:
                text += d
            maybe_p = response.xpath('//div[@class="body"]/p/text()').getall()
            for p in maybe_p:
                text += p
            maybe_s = response.xpath(
                '//div[@class="body"]/p/span/text()').getall()
            for s in maybe_s:
                text += s

        dic["text"] = text

        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        articles.insert_one(dic)
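
# Editor's note: the character scan in the spider above splits a squeezed date
# string of the form <digits><month name><digits> into day / month name / year.
# In Python 3, \d also matches Persian digits, so a regex can express the same
# split; a minimal sketch with an illustrative value:
import re

justdate_example = "۵مهر۱۳۹۸"  # illustrative: day ۵, month مهر, year ۱۳۹۸
match = re.match(r'(\d+)(\D+)(\d+)$', justdate_example)
if match:
    day, month_name, year = match.groups()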
Ejemplo n.º 20
0
    def parse(self, response):

        dic = {
            "title": " ",
            "timestamp": "",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        title = response.xpath('//a[@itemprop="headline"]/text()').get()
        dic["title"] = title

        news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
        dic["url"] = "https://www.mashreghnews.ir" + news_url

        sections = response.xpath(
            '//ol[@class="breadcrumb"]/li/a/text()').getall()
        dic["article_section"] = sections

        summary = response.xpath(
            '//p[@class="summary introtext"]/text()').get()
        dic["summary"] = summary

        date = response.xpath(
            '//div[@class="col-xs-8 col-sm-6 item-date"]/span/text()').get()
        date_list = date.split(' ')

        timelist = date_list[4].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])

        day = convert_persian_to_english_numbers(date_list[0])

        month = month_dic[date_list[1]]

        year = convert_persian_to_english_numbers(date_list[2])

        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()

        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))

        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath(
            '//div[@class="col-xs-8 col-sm-3"]/span/text()').getall()
        dic["code"] = code

        tags = response.xpath(
            '//section[@class="box tags clean list-clean list-inline header-inline header-clean negative-margin bg-graylight"]/div/ul/li/a/text()'
        ).getall()
        dic["tags"] = tags

        text_list1 = response.xpath(
            '//div[@class="item-text"]/p/span/text()').getall()
        text_list2 = response.xpath(
            '//div[@class="item-text"]/p/text()').getall()
        text = ""
        for t in text_list1:
            text += t
        for t in text_list2:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Ejemplo n.º 21
0
    def parse(self, response):

        dic = {"title": " "}

        # Concatenate the two heading parts; fall back to "" if either node is missing.
        title = response.xpath('//div[@class="news-head"]/h6/text()').get() or ""
        title += response.xpath('//div[@class="news-head"]/h2/text()').get() or ""
        dic["title"] = title

        news_url = response.xpath(
            '//*[@id="st-container"]/div/div/div/main/div[1]/div/div/div/ul/li[3]/a/@href'
        ).extract()[0]
        dic["url"] = "http://behdasht.gov.ir" + news_url

        # news_path
        sections = response.xpath(
            '//*[@id="page-content"]/div/article/div/div[2]/div/div/ul/li[1]/text()'
        ).getall()
        dic["article_section"] = sections[2]

        summary = response.xpath('//div[@class="news-lead"]/p/text()').get()
        dic["summary"] = summary

        date = response.xpath(
            '//*[@id="page-content"]/div/div[1]/div/div[1]/div/ul/li[1]/span/text()'
        ).get()
        date_list = date.split(' ')
        # print(date_list)
        timelist = date_list[5].split(':')
        hour = timelist[0]
        # print(hour)
        minute = timelist[1]
        # print(minute)
        #
        date_list = date_list[0].split("/")
        # print(date_list)
        day = date_list[2]
        month = date_list[1]
        year = date_list[0]
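        # The year field here appears to be two-digit (e.g. "98"), so 1300 is
        # added below to recover the full Jalali year before converting.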
        jalili_date = jdatetime.date(1300 + int(year), int(month),
                                     int(day)).togregorian()
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))

        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath(
            '//*[@id="page-content"]/div/article/div/div[2]/div/div/ul/li[2]/text()'
        ).getall()
        dic["code"] = code[2]

        tags = response.xpath(
            '//div[@class="es-news-tags"]/ul/li/a/text()').getall()
        dic["tags"] = tags

        text_list = response.xpath(
            '//div[@class="news-content"]/div/text()').getall()
        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Ejemplo n.º 22
0
from word2vec import get_word2vec
from tfidf import get_tfidt_vector

client = MongoClient()
db = client['newsdb']
searches = db.searches

articles = db.articles

search_result = db.searchresults

# Take the text of one stored search document (the 1001st most recent by _id).
text = searches.find().sort("_id", -1)[1000]["text"]

search_text = preprocess(text)
dic = {"preprocessed_text": search_text}
search_v_w2v = get_word2vec(dic)
search_v_tfidf = get_tfidt_vector(dic)


def similarity(vec, other_vec):
    dot = np.dot(vec, other_vec)
    norma = np.linalg.norm(vec)
    normb = np.linalg.norm(other_vec)
    cos = dot / (norma * normb)
    return cos
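
# Editor's note: a quick usage sketch for similarity() above. The raw cosine
# divides by the vector norms, so all-zero vectors have to be filtered out
# before calling it (the search code later in this dump checks
# np.all(vec == 0) for exactly that reason); the vectors below are
# illustrative, and numpy is already used as np in this module.
query_vec = np.array([0.1, 0.3, 0.0])
doc_vec = np.array([0.2, 0.1, 0.4])
if not np.all(query_vec == 0) and not np.all(doc_vec == 0):
    print(similarity(query_vec, doc_vec))  # cosine similarity in [-1, 1]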


wanted_news = []

# for res in result:
#     dic = {"text": "", "similar_texts": []}
Ejemplo n.º 23
0
    def parse(self, response):

        dic = {
            "title": " ",
            "timestamp": "",
            "url": " ",
            "date": " ",
            "text": " ",
            "summary": " ",
            "tags": [],
            "article_section": " ",
            "code": " "
        }

        title = response.xpath('//h1[@class="title"]/a/text()').get()
        dic["title"] = title

        news_url = response.css('h1[class=title] a::attr(href)').extract()[0]
        dic["url"] = "http://www.shafaf.ir" + news_url

        sections = []
        dic["article_section"] = sections

        summary = response.xpath('//p[@itemprop="description"]/text()').get()
        dic["summary"] = summary

        date = response.xpath(
            '//div[@class="news_nav news_pdate_c col-sm-16 col-xs-36"]/text()'
        ).getall()
        date = date[1]
        date_list = date.split(' ')
        timelist = date_list[4].split(':')
        hour = convert_persian_to_english_numbers(timelist[0])
        minute = convert_persian_to_english_numbers(timelist[1])
        day = convert_persian_to_english_numbers(date_list[0])
        month = month_dic[date_list[1]]
        year = convert_persian_to_english_numbers(date_list[2])
        jalili_date = jdatetime.date(int(year), int(month),
                                     int(day)).togregorian()
        datetime_object = datetime.datetime(jalili_date.year,
                                            jalili_date.month, jalili_date.day,
                                            int(hour), int(minute))
        dic["date"] = str(datetime_object)
        dic["timestamp"] = datetime_object.timestamp()

        code = response.xpath(
            '//div[@class="news_nav news_id_c"]/text()').get()
        # code = processed_text = " ".join(code.split())
        # code_list = code.split(' ')
        dic["code"] = code

        tags = []
        dic["tags"] = tags

        text_list1 = response.xpath('//div[@class="body"]/p/text()').getall()
        if len(text_list1) == 0:
            # item-text
            text_list2 = response.xpath(
                '//div[@class="body"]/div[@class="item-text"]/p/text()'
            ).getall()
            text_list = text_list2
        else:
            text_list = text_list1

        text = ""
        for t in text_list:
            text += t
        dic["text"] = text

        dic["preprocessed_title"] = preprocess(dic["title"])
        dic["preprocessed_summary"] = preprocess(dic["summary"])
        dic["preprocessed_text"] = preprocess(dic["text"])
        dic["w2v"] = get_word2vec(dic).tolist()
        dic["tfidf"] = get_tfidt_vector(dic).tolist()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index='newsindex', doc_type='news', body=dic)

        client = MongoClient()
        db = client['newsdb_week']
        articles = db.weekarticles
        result = articles.insert_one(dic)
Ejemplo n.º 24
0
def search(text):
    try:
        client = MongoClient()
        src_db = client['newsdb_week']
        #articles = src_db.searches
        dst_db = client['webdb']
        articles = src_db.weekarticles
        searchresults = dst_db.searchresults
        #text = "بازگشایی مدارس تهران"  # sample query: "reopening of Tehran's schools"
        #print(text)
        search_text = preprocess(text)
        search_text_tokens = search_text.split(' ')
        dic = {"preprocessed_text": search_text}
        search_v_w2v = get_word2vec(dic)
        search_v_tfidf = get_tfidt_vector(dic)
        result_w2v_list = []
        result_tfidf_list = []
        result_exact_list = []
        now = datetime.datetime.now()
        count = 0
        for a in articles.find(
            {"timestamp": {
                "$gt": now.timestamp() - 5184000.0
            }}):
            count = count + 1
            #for a in articles.find({}):
            if not np.all(search_v_w2v == 0):
                if not np.all(np.array(a["w2v"]) == 0):
                    if similarity(np.array(a["w2v"]), search_v_w2v) > 0.8:
                        result_w2v_list.append(a)
                if not np.all(np.array(a["tfidf"]) == 0):
                    if similarity(np.array(a["tfidf"]), search_v_tfidf) > 0.3:
                        result_tfidf_list.append(a)
            # for token in search_text_tokens:
            #     if token in a["text"]:
            #         result_exact_list.append(a)
            #         break
        # print("num of documents checked : ")
        # print(count)
        print(count)

        searchresults.delete_many({})
        #
        mydict = {
            "search_text": text,
            "result": result_w2v_list,
            "type": "w2v"
        }
        searchresults.insert_one(mydict)
        #
        mydict = {
            "search_text": text,
            "result": result_tfidf_list,
            "type": "tfidf"
        }
        searchresults.insert_one(mydict)
        #
        # mydict = {"search_text": text, "result": result_exact_list, "type": "exact"}
        # searchresults.insert_one(mydict)
        #
        # mydict = {"status": "done"}
        # searchstatus.insert_one(mydict)

        print("OK")
        print(len(result_tfidf_list))
        print(len(result_w2v_list))
        s = str(len(result_tfidf_list)) + str(len(result_w2v_list))
        return s
    except Exception as e:
        print(e)
        print("exception occured")
        return str(e)
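
# Editor's note: a minimal usage sketch for search() above, run as a script.
# The query reuses the sample text from the commented-out line earlier in this
# function ("reopening of Tehran's schools"); it is illustrative only.
if __name__ == "__main__":
    print(search("بازگشایی مدارس تهران"))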