Example #1
def create_client():
    global client
    params = (config_root.host.value, config_root.port.value,
              config_root.user.value, config_root.password.value)
    debug("Creating openHAB client: host=%s, port=%d, user=%s, password=%s" %
          params)
    client = RestClient(*params)
Example #2
def authenticate_dataforseo():
    creds = None
    with open("dataforseocreds.json", 'r') as f:
        creds = json.load(f)

    if creds:
        return RestClient(creds["login"], creds["password"])
    else:
        raise ValueError('No dataforseo credentials.')
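The helper above expects a local dataforseocreds.json file. Its contents are not shown in the source, but given the keys it reads, a minimal file would look roughly like this (placeholder values):

{
    "login": "name@example.com",
    "password": "your-dataforseo-password"
}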
Example #3
    if args.advanced:
        fields.extend(['is_image', 'is_video', 'is_featured_snippet', 'is_malicious',
                       'is_web_story', 'amp_version', 'rating', 'sitelinks', 'faq',
                       'items', 'pixels_from_top'])
    if args.knowledge_graph:
        fields.extend(['sub_title','address','phone'])

    # Output name
    timestr = time.strftime("%Y%m%d-%H%M%S")
    tag = args.output + "-" + timestr
    filename = tag + ".csv"

    with open(filename, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fields, delimiter=";")
        writer.writeheader()

    client = RestClient(user,password)

    # While there are results, request the next batch
    next_batch = True
    while next_batch:
        response = client.get("/v3/serp/google/organic/tasks_ready")
        if response['status_code'] == 20000:
            tasks_available = response["tasks"][0]["result_count"]
            print("{} tasks available".format(tasks_available))
            if tasks_available < 1:
                next_batch = False
            results = []
            for task in response['tasks']:
                if task['result'] and len(task['result']) > 0:
                    for resultTaskInfo in task['result']:
                        if resultTaskInfo['endpoint_advanced']:
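                            # The example is truncated at this point in the source. A minimal sketch of a
                            # typical continuation (an assumption, not the original code):
                            # 'endpoint_advanced' holds the ready-made GET path of a finished task, so it
                            # can be fetched and collected for later CSV writing.
                            results.append(client.get(resultTaskInfo['endpoint_advanced']))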
Example #4
File: rank.py Project: StanGirard/NLP_Rank
class GetList:
    client = RestClient("*****@*****.**", "2iHUO8lhRFD5vqEP")
    stop_words = set(stopwords.words("french"))
    stop_words_english = set(stopwords.words("english"))
    stop_words_multilingual = stop_words.union(stop_words_english)
    new_stopwords_list = stop_words_multilingual.union(stop)  # 'stop' is an additional stopword collection defined elsewhere in rank.py
    def create_request(self, post_data):
        response = self.client.post("/v2/live/srp_tasks_post", dict(data=post_data))
        if response["status"] == "error":
            print("error. Code: %d Message: %s" % (response["error"]["code"], response["error"]["message"]))
            return []
        else:
            return response

    def get_results_from_keywords(self, keywords):
        rnd = Random()  # the random key serves as the "index of post_data"; it is returned with the results
        post_data = dict()
        post_data[rnd.randint(1, 30000000)] = dict(
            se_name="google.fr",
            se_language="French",
            se_localization='fr-fr',
            loc_id=1006094,
            key=keywords
        )
        return self.create_request(post_data)

    def get_results_from_taskid(self, taskId):
        srp_response = self.client.get("/v2/srp_tasks_get/%d" % (taskId))
        if srp_response["status"] == "error":
            print("error. Code: %d Message: %s" % (srp_response["error"]["code"], srp_response["error"]["message"]))
            return []
        else:
            return srp_response
    def get_results(self, keywords):
        allinfo = []
        if isinstance(keywords, int):
            allinfo = self.get_results_from_taskid(keywords)
        else:
            allinfo = self.get_results_from_keywords(keywords)
        return allinfo

    def extract_url_from_results(self, data):
        url_result = []
        for i in data["results"]["organic"]:
            url_result.append(i["result_url"])
        return url_result
    
    def extract_text_from_url(self, urls):
        text = []
        count = 0
        for url in urls:
            try:
                html = requests.get(url).text
                #html = urllib.request.urlopen(url, timeout=3).read().decode("utf8")
                text.append(get_text(html))
                count += 1
            except Exception:
                print("Error: " + url)
        print("Count = ", count)
        return text

    def normalize_text_list(self, textArray):
        corpus = []
        for i in range(len(textArray)):
            # Replace apostrophes and double quotes with spaces
            text = re.sub("'", "  ", textArray[i])
            text = re.sub('"', " ", text)

            # Remove URLs
            text = re.sub(r"(?:(?:https?|ftp|file)://|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])", " ", text, flags=re.IGNORECASE)

            #text = str(unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore'))

            #Remove Special characters
            #text = re.sub('[^a-zA-Z0-9]', ' ', text)
            text = re.sub('[^a-zA-Z0-9áéèíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇû: ]', ' ', text)
            
            #Convert to lowercase
            text = text.lower()
            
            # Remove HTML tags
            text = re.sub("</?.*?>", " <> ", text)
            
            # remove special characters and digits
            text=re.sub("(\\d|\\W)+"," ",text)
            
            ##Convert to list from string
            text = text.split()
            
            ##Stemming
            #ps=PorterStemmer()
            #Lemmatisation
            #lem = WordNetLemmatizer()
            text = [word for word in text if word not in self.new_stopwords_list]
            text = " ".join(text)
            corpus.append(text)
        return corpus
    def write_corpus_to_file(self,corpus):
        with open("corpus.txt", "wb") as fp:
            pickle.dump(corpus, fp)

    def write_corpus_to_file_from_keywords(self, keywords):
        allinfo = self.get_results(keywords)
        vim = self.extract_url_from_results(allinfo)
        tex = self.extract_text_from_url(vim)
        corpus = self.normalize_text_list(tex)
        self.write_corpus_to_file(corpus)
    def write_corpus_to_file_not_normalized_from_keywords(self, keywords):
        allinfo = self.get_results(keywords)
        vim = self.extract_url_from_results(allinfo)
        tex = self.extract_text_from_url(vim)
        self.write_corpus_to_file(tex)
    
    def read_corpus_from_file(self, filename):
        corpus = []
        with open(filename, 'rb') as fp:
            corpus = pickle.load(fp)
        return corpus

    def get_top_n_ygrams_words(self, corpus, n=None, y=1):
        vec = CountVectorizer(min_df=0.1, stop_words=self.new_stopwords_list,
                              max_features=10000, ngram_range=(y, y)).fit(corpus)
        bag_of_words  = vec.transform(corpus)
        sum_words = bag_of_words.sum(axis=0) 
        words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
        frame = pd.DataFrame(words_freq[:n])
        switcher = {
            1: "Monogram",
            2: "Bigram",
            3: "Trigram",
        }
        switcherFreq = {
            1: "Freq_Mono",
            2: "Freq_Bi",
            3: "Freq_Tri",
        }
        frame.columns = [switcher.get(y), switcherFreq.get(y)]
        return frame
    
    def get_all_ygrams(self, corpus, monograms, bigrams, trigrams):
        mono = self.get_top_n_ygrams_words(corpus, monograms, 1)
        bi = self.get_top_n_ygrams_words(corpus, bigrams, 2)
        tri = self.get_top_n_ygrams_words(corpus, trigrams, 3)
        # Merge the three frequency tables into a single dict
        combined = {}
        combined.update(mono.to_dict())
        combined.update(bi.to_dict())
        combined.update(tri.to_dict())
        return combined
    
    def prepare_corpus(self,doc_clean):
        """
        Input  : clean document
        Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
        Output : term dictionary and Document Term Matrix
        """
        # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
        dictionary = corpora.Dictionary(doc_clean)
        # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
        # generate LDA model
        return dictionary,doc_term_matrix
    def create_gensim_lsa_model(self,doc_clean,number_of_topics,words):
        """
        Input  : clean document, number of topics and number of words associated with each topic
        Purpose: create LSA model using gensim
        Output : return LSA model
        """
        dictionary,doc_term_matrix=self.prepare_corpus(doc_clean)
        # generate LSA model
        lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
        print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
        return lsamodel
    
    def compute_coherence_values(self,dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
        """
        Input   : dictionary : Gensim dictionary
                corpus : Gensim corpus
                texts : List of input texts
                stop : Max num of topics
        purpose : Compute c_v coherence for various number of topics
        Output  : model_list : List of LSA topic models
                coherence_values : Coherence values corresponding to the LSA model with the respective number of topics
        """
        coherence_values = []
        model_list = []
        for num_topics in range(start, stop, step):
            # generate LSA model
            print("Number of topics: ", num_topics)
            model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary)  # train model
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
            coherenceSS = coherencemodel.get_coherence()
            coherence_values.append(coherenceSS)
        return model_list, coherence_values
    
    def plot_graph(self, doc_clean,start, stop, step):
        dictionary,doc_term_matrix=self.prepare_corpus(doc_clean)
        
        model_list, coherence_values = self.compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                                stop, start, step)
        # Show graph
        
        x = range(start, stop, step)
        for value in coherence_values:
            if value > 0.4:
                print(value)
        
        plt.plot(x, coherence_values)
        plt.xlabel("Number of Topics")
        plt.ylabel("Coherence score")
        plt.legend(["coherence_values"], loc='best')
        plt.show()
    def calculate_optimal_coherence_value(self, doc_clean, start, stop, step):
        dictionary, doc_term_matrix = self.prepare_corpus(doc_clean)
        model_list, coherence_values = self.compute_coherence_values(dictionary, doc_term_matrix, doc_clean,
                                                                stop, start, step)
        best = 0
        topics = 0

        for i in range(len(coherence_values)):
            if coherence_values[i] > 0.35:
                if coherence_values[i] > best:
                    best = coherence_values[i]
                    # range(start, stop, step): index i corresponds to start + i * step topics
                    topics = start + i * step
                print(coherence_values[i])
        return topics
    def generate_optimal_topic(self, doc_clean, start, stop, step):
        topics = self.calculate_optimal_coherence_value(doc_clean, start, stop, step)
        print(self.create_gensim_lsa_model(doc_clean,topics,3))
    
    def corpus_to_gensim(self,corpus):
        gensim = []
        for i in corpus:
            token = i.split()
            gensim.append(token)
        return gensim

    def generate_results(self, keywords):
        allinfo = self.get_results(keywords)
        vim = self.extract_url_from_results(allinfo)
        tex = self.extract_text_from_url(vim)
        corpus = self.normalize_text_list(tex)
        gensim = self.corpus_to_gensim(corpus)
        #model = self.create_gensim_lsa_model(gensim,10,3)
        start,stop,step = 2,10,1
        #self.plot_graph(gensim,start,stop,step)
        self.generate_optimal_topic(gensim,start,stop,step)
        results = self.get_all_ygrams(corpus, 40, 20, 10)
        print(results)
        return results

    def generate_results_from_file(self, filename):
        corpus = self.read_corpus_from_file(filename)
        corpus = self.normalize_text_list(corpus)
        gensim = self.corpus_to_gensim(corpus)
        #model = self.create_gensim_lsa_model(gensim,10,3)
        start,stop,step = 2,10,1
        self.generate_optimal_topic(gensim,start,stop,step)
        results = self.get_all_ygrams(corpus, 100, 40, 10)
        print(results)
        return 1

    def generate_results_from_url(self, url):
        rscraper = RecursiveScraper(url)
        rscraper.scrape()
        tex = self.extract_text_from_url(rscraper.urls)
        corpus = self.normalize_text_list(tex)
        gensim = self.corpus_to_gensim(corpus)
        #model = self.create_gensim_lsa_model(gensim,10,3)
        start,stop,step = 2,10,1
        self.generate_optimal_topic(gensim,start,stop,step)
        results = self.get_all_ygrams(corpus, 40, 20, 10)
        print(results)
        return results
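A minimal usage sketch for the GetList class above (the keyword is a placeholder, and the sketch assumes the credentials baked into the class attribute are valid):

if __name__ == "__main__":
    getter = GetList()
    # Full pipeline: fetch SERP results for a keyword, scrape the result URLs, normalize the text,
    # pick an LSA topic count by coherence, and print the top n-grams.
    getter.generate_results("content marketing")
    # Alternatively, rebuild results from a previously pickled corpus:
    # getter.generate_results_from_file("corpus.txt")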
Example #5
    def __init__(self):
        self.client = RestClient(LOGIN, PASSWORD)
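LOGIN and PASSWORD are not defined in this fragment; a common pattern (an assumption, not shown in the source) is to read them from environment variables:

import os

# Hypothetical setup for the constructor above; the source does not show where the credentials come from.
LOGIN = os.environ.get("DATAFORSEO_LOGIN")
PASSWORD = os.environ.get("DATAFORSEO_PASSWORD")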
Example #6
from client import RestClient
import json, logging, requests, random

client = RestClient('*****@*****.**', 'LD87rm8Od9dfWnVn')

# Search class has modules for different functions
class Search():
    # https://docs.dataforseo.com/#get-related-keywords
    def keywords_related(self, word, location):
        rnd = random.Random()  # the random key serves as the "index of post_data"; it is returned with the results
        post_data = dict()

        post_data[rnd.randint(1, 30000000)] = dict(
            keyword=word,
            country_code=location,
            language="en",
            depth=1,
            limit=2,
            offset=0,
            orderby="cpc,desc",
            filters=[
                ["cpc", ">", 0],
                "or", 
                [
                    ["search_volume", ">", 0],
                    "and",
                    ["search_volume", "<=", 1000]
                ]
            ]
        )
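        # The example stops before the request is sent. A hypothetical continuation, following the
        # client.post pattern used in the other examples; 'related_keywords_path' stands in for the
        # endpoint documented at https://docs.dataforseo.com/#get-related-keywords (not shown here).
        response = client.post(related_keywords_path, dict(data=post_data))
        if response["status"] == "error":
            print("error. Code: %d Message: %s" %
                  (response["error"]["code"], response["error"]["message"]))
            return []
        return response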