コード例 #1
0
    def get_tweets(self, index, doc_field):
        """Collect the tokenized ``doc_field`` of every document that has it.

        Runs a paginated search over ``index`` restricted to documents in
        which ``doc_field`` exists, requesting only ``doc_field`` and
        ``timestamp_ms``, and walks through all result pages.

        Args:
            index: Index name handed to ``Es_connector``.
            doc_field: Name of the ``_source`` field to tokenize.

        Returns:
            A list with one entry per retrieved document, shaped as
            ``{"_source": {doc_field: <self.tknzr.tokenize output>,
            "timestamp_ms": <original value>}}``.
        """
        my_connector = Es_connector(index=index)
        query = {
            "_source": [doc_field, "timestamp_ms"],
            "query": {
                "exists": {
                    "field": doc_field
                }
            }
        }
        res = my_connector.init_paginatedSearch(query)
        sid = res["sid"]
        scroll_size = res["scroll_size"]

        all_tweets = []
        # A page reporting scroll_size == 0 marks the end of the results.
        while scroll_size > 0:
            for tweet in res["results"]:
                source = tweet["_source"]
                all_tweets.append({
                    '_source': {
                        doc_field: self.tknzr.tokenize(source[doc_field]),
                        "timestamp_ms": source["timestamp_ms"]
                    }
                })

            res = my_connector.loop_paginatedSearch(sid, scroll_size)
            scroll_size = res["scroll_size"]

        return all_tweets
コード例 #2
0
 def get_tweets_query_state(self,
                            index="test3",
                            word="",
                            state="proposed",
                            session=""):
     """Start a paginated search for tweets matching ``word`` in a given state.

     The query requires a ``simple_query_string`` match of ``word`` on the
     ``text`` field and filters on a ``match`` of the field named by
     ``session`` against ``state``.

     Args:
         index: Index name handed to ``Es_connector``.
         word: Query string searched in the ``text`` field.
         state: Value the ``session`` field is matched against.
         session: Name of the field holding the state; used verbatim as
             the field name.

     Returns:
         Whatever ``Es_connector.init_paginatedSearch`` returns for the query.
     """
     connector = Es_connector(index=index)

     text_clause = {"simple_query_string": {"fields": ["text"], "query": word}}
     state_clause = {"bool": {"should": [{"match": {session: state}}]}}
     body = {"query": {"bool": {"must": text_clause, "filter": state_clause}}}

     return connector.init_paginatedSearch(body)
コード例 #3
0
 def get_event_tweets(self, index="test3", main_term="", related_terms=""):
     """Start a paginated, score-sorted search for tweets about an event.

     Builds a ``bool``/``should`` query on the ``text`` field with one
     ``match`` clause per related term (boosted by that term's ``value``)
     plus one clause for ``main_term`` with a fixed boost of 2.

     Args:
         index: Index name handed to ``Es_connector``.
         main_term: Main term of the event.
         related_terms: Iterable of mappings with ``'word'`` and ``'value'``
             keys. The empty-string default yields no related clauses.

     Returns:
         Whatever ``Es_connector.init_paginatedSearch`` returns for the query.
     """
     my_connector = Es_connector(index=index)

     terms = [{
         "match": {
             "text": {
                 "query": t['word'],
                 "boost": t['value']
             }
         }
     } for t in related_terms]
     # The main term is always part of the query, with a fixed boost.
     terms.append({"match": {"text": {"query": main_term, "boost": 2}}})

     query = {"sort": ["_score"], "query": {"bool": {"should": terms}}}
     return my_connector.init_paginatedSearch(query)
コード例 #4
0
    def get_tweets(self, index="test3", word=""):
        """Start a paginated search for tweets whose ``text`` matches ``word``.

        Args:
            index: Index name handed to ``Es_connector``.
            word: Query string for a ``simple_query_string`` search on the
                ``text`` field.

        Returns:
            Whatever ``Es_connector.init_paginatedSearch`` returns for the
            query.
        """
        connector = Es_connector(index=index)
        text_search = {"simple_query_string": {"fields": ["text"], "query": word}}
        return connector.init_paginatedSearch({"query": text_search})
コード例 #5
0
 def get_tweets_state(self, index="test3", session="", state="proposed"):
     """Start a paginated search for tweets in ``state`` for a session.

     Args:
         index: Index name handed to ``Es_connector``.
         session: Session name; the queried field is ``"session_" + session``.
         state: Value used in the ``term`` query on that field.

     Returns:
         Whatever ``Es_connector.init_paginatedSearch`` returns for the query.
     """
     connector = Es_connector(index=index)
     session_field = "session_" + session
     body = {"query": {"term": {session_field: state}}}
     return connector.init_paginatedSearch(body)
コード例 #6
0
    def get_event_tweets2(self,
                          index="test3",
                          main_term="",
                          related_terms="",
                          cid=0):
        """Start a paginated, score-sorted event search within an image cluster.

        Builds a ``bool`` query that must ``match`` ``imagesCluster`` against
        ``cid`` and must satisfy at least one ``should`` clause on the
        ``text`` field: one ``match`` per related term (boosted by its
        ``value``) plus one for ``main_term`` with a fixed boost of 2.

        Args:
            index: Index name handed to ``Es_connector``.
            main_term: Main term of the event.
            related_terms: Iterable of mappings with ``'word'`` and
                ``'value'`` keys. The empty-string default yields no related
                clauses.
            cid: Value matched against the ``imagesCluster`` field.

        Returns:
            Whatever ``Es_connector.init_paginatedSearch`` returns for the
            query.
        """
        my_connector = Es_connector(index=index)

        terms = [{
            "match": {
                "text": {
                    "query": t['word'],
                    "boost": t['value']
                }
            }
        } for t in related_terms]
        # The main term is always part of the query, with a fixed boost.
        terms.append({"match": {"text": {"query": main_term, "boost": 2}}})

        query = {
            "sort": ["_score"],
            "query": {
                "bool": {
                    "should": terms,
                    "minimum_should_match": 1,
                    "must": [{
                        "match": {
                            "imagesCluster": cid
                        }
                    }]
                }
            }
        }

        return my_connector.init_paginatedSearch(query)
コード例 #7
0
    def get_event_filter_tweets(self,
                                index="test3",
                                main_term="",
                                related_terms="",
                                state="proposed",
                                session=""):
        """Start a paginated, score-sorted event search filtered by state.

        Builds a ``bool`` query whose ``must`` part is a nested
        ``bool``/``should`` over the ``text`` field (one ``match`` per
        related term boosted by its ``value``, plus ``main_term`` with a
        fixed boost of 2) and whose ``filter`` part matches the field named
        by ``session`` against ``state``.

        Args:
            index: Index name handed to ``Es_connector``.
            main_term: Main term of the event.
            related_terms: Iterable of mappings with ``'word'`` and
                ``'value'`` keys. The empty-string default yields no related
                clauses.
            state: Value the ``session`` field is matched against.
            session: Name of the field holding the state; used verbatim as
                the field name.

        Returns:
            Whatever ``Es_connector.init_paginatedSearch`` returns for the
            query.
        """
        my_connector = Es_connector(index=index)

        terms = [{
            "match": {
                "text": {
                    "query": t['word'],
                    "boost": t['value']
                }
            }
        } for t in related_terms]
        # The main term is always part of the query, with a fixed boost.
        terms.append({"match": {"text": {"query": main_term, "boost": 2}}})

        query = {
            "sort": ["_score"],
            "query": {
                "bool": {
                    "must": [{
                        "bool": {
                            "should": terms
                        }
                    }],
                    "filter": {
                        "bool": {
                            "should": [{
                                "match": {
                                    session: state
                                }
                            }]
                        }
                    }
                }
            }
        }
        return my_connector.init_paginatedSearch(query)
コード例 #8
0
    def generate_ngrams_for_index(self, **kwargs):
        """Generate n-grams for every document returned by a paginated search.

        Walks the result pages of ``query`` on ``index`` and hands each page
        to ``self.gerenate_ngrams_for_tweets``. Progress is published in
        ``self.current_thread_percentage`` (0 at start, 100 on success).

        Keyword Args:
            index: Index name handed to ``Es_connector`` and forwarded to the
                per-page generator.
            query: Optional search body; defaults to a ``match_all`` query.
            from_property: Forwarded to ``gerenate_ngrams_for_tweets``.
            prop: Forwarded to ``gerenate_ngrams_for_tweets``.
            length: Forwarded to ``gerenate_ngrams_for_tweets``.

        Returns:
            True when all pages were processed (including the case of an
            empty result set), False if any exception was raised; the error
            is printed rather than propagated.
        """
        try:
            # Get the data for performing a paginated search
            self.current_thread_percentage = 0
            print("Starting")
            my_connector = Es_connector(index=kwargs["index"])

            query = kwargs.get('query', {"query": {"match_all": {}}})

            res = my_connector.init_paginatedSearch(query)
            sid = res["sid"]
            scroll_size = res["scroll_size"]
            total = int(res["total"])

            # Number of pages expected, rounded up so a partial last page is
            # counted; guarded so an empty result set does not divide by zero.
            if scroll_size > 0:
                total_scrolls = (total + scroll_size - 1) // scroll_size
            else:
                total_scrolls = 0
            processed_scrolls = 0

            print("from_property:", kwargs['from_property'])

            # Analyse and process page by page
            while scroll_size > 0:
                tweets = res["results"]
                self.gerenate_ngrams_for_tweets(
                    tweets,
                    from_property=kwargs['from_property'],
                    prop=kwargs["prop"],
                    index=kwargs["index"],
                    length=kwargs["length"])

                res = my_connector.loop_paginatedSearch(sid, scroll_size)
                scroll_size = res["scroll_size"]
                processed_scrolls += 1

                if total_scrolls > 0:
                    # Clamp so a mis-estimated page count never reports >100%.
                    self.current_thread_percentage = min(
                        100.0,
                        round(processed_scrolls * 100 / total_scrolls, 0))

                print("Completed: ", self.current_thread_percentage, "%")

            # Set it at the end so the client knows when to stop asking for
            # more logs
            self.current_thread_percentage = 100

            return True

        except Exception as e:
            print('Error: ' + str(e))
            return False
コード例 #9
0
 def get_big_tweets_scroll(self, index="test3", word=""):
     """Start a paginated text search returning a reduced set of fields.

     Args:
         index: Index name handed to ``Es_connector``.
         word: Query string for a ``simple_query_string`` search on the
             ``text`` field.

     Returns:
         Whatever ``Es_connector.init_paginatedSearch`` returns for the query.
     """
     connector = Es_connector(index=index)

     # Only these _source fields are requested for each hit.
     wanted_fields = [
         "text", "id_str", "extended_entities", "user", "created_at", "link"
     ]
     text_search = {"simple_query_string": {"fields": ["text"], "query": word}}

     return connector.init_paginatedSearch({
         "_source": wanted_fields,
         "query": text_search
     })
コード例 #10
0
    def search_bigrams_related_tweets(self, **kwargs):
        """Start a paginated search for tweets containing a given n-gram.

        Every hit must ``match_phrase`` ``ngram`` in the ``ngramsPropName``
        field and ``match`` ``label`` in the ``session`` field. Unless
        ``full_search`` is truthy, the ``text`` field must also match
        ``word``. The query is printed before it is sent.

        Keyword Args:
            index: Index name handed to ``Es_connector``.
            full_search: Optional; when truthy the ``word`` clause is omitted.
            word: Text to match; only read when ``full_search`` is falsy.
            ngramsPropName: Name of the field holding the n-grams.
            ngram: Phrase to look for in that field.
            session: Name of the field holding the label.
            label: Value the ``session`` field is matched against.

        Returns:
            Whatever ``Es_connector.init_paginatedSearch`` returns for the
            query.
        """
        connector = Es_connector(index=kwargs["index"])

        required = []
        if not kwargs.get('full_search', False):
            # Keyword-restricted search: the text clause comes first.
            required.append({"match": {"text": kwargs["word"]}})
        required.append(
            {"match_phrase": {kwargs["ngramsPropName"]: kwargs["ngram"]}})
        required.append({"match": {kwargs["session"]: kwargs["label"]}})

        query = {"query": {"bool": {"must": required}}}

        print(query)

        return connector.init_paginatedSearch(query)
コード例 #11
0
    def download_tweets_from_elastic(self, **kwargs):
        """Download the results of a paginated search into folders.

        Each page of results is passed to ``self.write_data_in_folders``
        together with ``field`` and ``folder``.

        Keyword Args:
            index: Index name handed to ``Es_connector``.
            query: Search body for the paginated search.
            field: Forwarded to ``write_data_in_folders``.
            folder: Forwarded to ``write_data_in_folders``.
            config_relative_path: Optional; forwarded to ``Es_connector``
                only when present.
            debug_limit: Optional (default False); when truthy, stops after
                the first loop iteration, which fetches and writes one
                additional page before exiting.
            log_enabled: Optional (default True); print download progress.

        Returns:
            The ``total`` reported by the initial search, as an int.
        """
        debug_limit = kwargs.get("debug_limit", False)
        log_enabled = kwargs.get("log_enabled", True)

        connector_options = {"index": kwargs["index"], "doc_type": "tweet"}
        if "config_relative_path" in kwargs:
            connector_options["config_relative_path"] = kwargs[
                "config_relative_path"]
        connector = Es_connector(**connector_options)

        def store(page):
            # Persist one page of hits under the requested field/folder.
            self.write_data_in_folders(kwargs["field"], kwargs["folder"],
                                       page["results"])

        page = connector.init_paginatedSearch(kwargs["query"])
        sid = page["sid"]
        scroll_size = page["scroll_size"]
        total = int(page["total"])
        processed = len(page["results"])

        store(page)

        while scroll_size > 0:
            page = connector.loop_paginatedSearch(sid, scroll_size)
            scroll_size = page["scroll_size"]
            processed += len(page["results"])

            # Writing the retrieved files into the folders
            store(page)
            if log_enabled:
                print("Downloading: ", round(processed * 100 / total, 2), "%")

            if debug_limit:
                print("\nDEBUG LIMIT\n")
                page = connector.loop_paginatedSearch(sid, scroll_size)
                store(page)
                # Forces the loop to end after this iteration.
                scroll_size = 0

        return total
コード例 #12
0
    def search_event_bigrams_related_tweets(self, **kwargs):
        """Start a paginated search for event tweets containing a given n-gram.

        Every hit must ``match_phrase`` ``ngram`` in the ``ngramsPropName``
        field, ``match`` ``label`` in the ``session`` field, and satisfy at
        least one of the ``target_terms`` clauses.

        Keyword Args:
            index: Index name handed to ``Es_connector``.
            target_terms: Clauses used verbatim as the ``should`` part of the
                ``bool`` query.
            ngramsPropName: Name of the field holding the n-grams.
            ngram: Phrase to look for in that field.
            session: Name of the field holding the label.
            label: Value the ``session`` field is matched against.

        Returns:
            Whatever ``Es_connector.init_paginatedSearch`` returns for the
            query.
        """
        connector = Es_connector(index=kwargs["index"])

        optional_clauses = kwargs["target_terms"]
        ngram_clause = {
            "match_phrase": {kwargs["ngramsPropName"]: kwargs["ngram"]}
        }
        label_clause = {"match": {kwargs["session"]: kwargs["label"]}}

        bool_query = {
            "should": optional_clauses,
            "minimum_should_match": 1,
            "must": [ngram_clause, label_clause]
        }

        return connector.init_paginatedSearch({"query": {"bool": bool_query}})
コード例 #13
0
            index=index,
            doc_type="tweet",
            id=tweet["_id"],
            body={"doc": {
                output_field: full_text
            }})
    print("Languages for stopwords: ", ngramsAnalizer.retrievedLangs)


try:
    my_connector = Es_connector(index=index)
    #query = #"query": {
    #"match_all": {}
    #}
    query = {"query": {"match": {"lang": "en or fr or es"}}}
    res = my_connector.init_paginatedSearch(query=query)

    sid = res["sid"]
    scroll_size = res["scroll_size"]
    init_total = int(res["total"])
    accum_total = 0

    print("\nTotal = ", init_total)
    print("\nScroll = ", scroll_size)
    print("\nLangs = ", langs)

    while scroll_size > 0:

        generate_text_images_prop(res["results"], langs)
        res = my_connector.loop_paginatedSearch(sid, scroll_size)
        scroll_size = res["scroll_size"]