Code Example #1
def task(queue, title):
    # Score every queued keyword against the request title; `nlp` and the
    # shared score dict `l` are module-level globals in this project.
    doc = nlp(title)
    while not queue.empty():
        word = queue.get()
        if word not in l:
            doc1 = nlp(word)
            l[word] = doc1.similarity(doc)
            # sen.append(word)
        else:
            print("exists")
        queue.task_done()
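A minimal driver for this snippet, assuming the module-level globals it relies on: a loaded spaCy model `nlp` and a shared score dict `l` (both names come from the other examples on this page; everything else in the driver is hypothetical). The workers are daemonized because task() checks queue.empty() before queue.get(), so a thread that loses that race would otherwise block forever:

import queue
import threading

import spacy

nlp = spacy.load("en_core_web_md")  # similarity needs vectors, so md/lg, not sm
l = {}

q = queue.Queue()
for word in ["laptop stand", "usb hub", "hdmi cable"]:
    q.put(word)

workers = [threading.Thread(target=task, args=(q, "aluminum laptop stand"),
                            daemon=True)
           for _ in range(4)]
for w in workers:
    w.start()
q.join()  # returns once task() has called task_done() for every item
print(l)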
Code Example #2
File: fast_api.py Project: rockycoder/lang_research
def keywords():
    payload = flask.request.get_json(force=True)
    keywords = payload["keywords"]
    title = nlp(payload["title"])  # parse once; reused for every keyword
    Asin = payload["Asin"]
    sort = payload["sort"]
    l.clear()
    response.clear()
    global comp_str
    comp_str = ""
    if sort:
        comp_str = title  # despite the name, this holds the parsed Doc
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            # get_score (Code Example #8) compares each keyword to comp_str
            for word, score in executor.map(get_score, keywords):
                response[word] = score

        result = {}  # dictionary to keep output
        key = []  # list of {keyword, relevance_score} entries
        result["ASIN"] = Asin
        sorted_l = sorted(response.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        for word, score in sorted_l:
            key.append({'keyword': word, 'relevance_score': score})
        result["result"] = key
        return flask.jsonify(result)

    else:
        result = {"keywords": keywords}
        return flask.jsonify(result)
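A portability caveat worth noting (not from the original project): get_score reads the module-level comp_str, which worker processes only inherit when the pool forks, the Linux default. Under the spawn start method (Windows, and macOS since Python 3.8) each worker re-imports the module, so it would not see the title set in the request handler. A hedged, hypothetical alternative is to pass the title text explicitly:

import functools

def get_score_explicit(title_text, keyword):
    # Hypothetical variant of get_score (Code Example #8) that takes the
    # title as an argument instead of reading the comp_str global.
    doc1 = nlp(keyword)
    return keyword, doc1.similarity(nlp(title_text))

# inside keywords():
#     scorer = functools.partial(get_score_explicit, payload["title"])
#     for word, score in executor.map(scorer, keywords):
#         response[word] = score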
Code Example #3
def analyze(request_uuid, filename):
    filepath = os.path.join(app.config["UPLOAD_FOLDER"], request_uuid,
                            filename)
    n_max_words = 15
    with open(filepath, "rb") as file_in:
        pdf_byte_content = BytesIO(file_in.read())
    try:
        # PDFSyntaxError is raised while parsing, not while reading the file
        corpus = extract_text(pdf_byte_content)
    except PDFSyntaxError as e:
        return make_response(jsonify(status="KO", message=str(e)), 400)
    app.logger.info("Loading spaCy English model may take up to 1 minute")
    # (the model load itself is elided in this excerpt; nlp is module-level)
    app.logger.info("Model loaded")
    doc = nlp(corpus)
    doc_data = get_data(nlp, doc)
    norm_data = normalize_data(doc_data)
    results_dir = os.path.join(app.config["RESULTS_FOLDER"], request_uuid)
    create_nonexistent_dir(results_dir)
    save_wordcloud(corpus, results_dir)
    for pos in norm_data:
        if len(norm_data[pos]) != 0:
            plot_pos(norm_data[pos], results_dir, n_max_words, type_pos=pos)
    tp = TextPreprocessor(corpus)
    cleaned_text = tp.preprocess()
    kwords_data = kwords_count(cleaned_text)
    if len(kwords_data) != 0:
        plot_kwords(kwords_data, results_dir, n_max_words)
    else:
        app.logger.warning("No keywords found in the provided PDF")
    return serve_plots(request_uuid)
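For reference, the imports this view function appears to assume; this is a guess from the call sites, but extract_text and PDFSyntaxError do match pdfminer.six's public API:

from io import BytesIO

from flask import jsonify, make_response
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFSyntaxError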
Code Example #4
def keyword():
    payload = flask.request.get_json(force=True)
    keywords = payload["keywords"]
    title = nlp(payload["title"])
    Asin = payload["Asin"]
    sort = payload["sort"]
    l.clear()

    if sort:
        global exitFlag
        exitFlag = 0

        threadList = ["Thread-%d" % i for i in range(1, 11)]

        workQueue.queue.clear()
        threads = []
        threadID = 1

        # Create new threads
        for tName in threadList:
            thread = process_keywords.myThread(threadID, tName, workQueue, title)
            thread.start()
            threads.append(thread)
            threadID += 1

        # Fill the queue
        queueLock.acquire()
        for word in keywords:
            workQueue.put(word.strip())
        queueLock.release()

        # Busy-wait until the queue drains
        while not workQueue.empty():
            pass

        # Notify threads it's time to exit
        exitFlag = 1

        # Wait for all threads to complete
        for t in threads:
            t.join()
        print("Exiting Main Thread")
        key = []
        result = {}
        sorted_l = sorted(l.items(), key=operator.itemgetter(1), reverse=True)
        for word, score in sorted_l:
            key.append({'keyword': word, 'relevance_score': score})
        result["result"] = key
        return flask.jsonify(result)
    else:
        result = {"keywords": keywords}
        return flask.jsonify(result)
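This appears to be the thread-based counterpart of the process-pool version in Code Example #2. The main thread busy-waits until the queue drains, then flips the shared exitFlag that the worker loop in Code Example #5 polls; each worker finishes its current keyword and exits, after which the join() calls return.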
Code Example #5
def get_keyword(threadName, q, title):
    # Worker loop run by each thread: poll the shared exitFlag set by
    # keyword() (Code Example #4) and score one queued keyword at a time.
    while not new_app.exitFlag:
        new_app.queueLock.acquire()
        if not new_app.workQueue.empty():
            data = q.get()
            new_app.queueLock.release()
            doc1 = nlp(data)
            score = doc1.similarity(title)
            new_app.l[data] = score
            print("%s processing %s  %-100s" % (threadName, data,
                                                datetime.datetime.utcnow()))
        else:
            new_app.queueLock.release()
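Code Examples #4 and #5 both assume a process_keywords.myThread wrapper that is not shown on this page. A minimal reconstruction, hypothetical apart from the constructor arguments visible in Example #4:

import threading

class myThread(threading.Thread):
    # Hypothetical wrapper: each thread runs the worker loop from
    # Code Example #5 against the shared queue.
    def __init__(self, threadID, name, q, title):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.q = q
        self.title = title

    def run(self):
        get_keyword(self.name, self.q, self.title)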
Code Example #6
def get_data(nlp, doc):
    """
    Get doc main POS data and entities
    with their types from a given corpus
    :param nlp: spacy.lang.en.English
    :param doc: spacy.tokens.doc.Doc object
    :return: dict
    """
    app.logger.debug("Getting data from doc")
    nouns, adjectives = [], []
    adverbs, verbs = [], []
    entities, entity_types = [], []
    for sent in doc.sents:
        for token in sent:
            if not token.is_oov and len(token.text) > 1:
                if token.pos_ == "ADV":
                    adverbs.append(token.lower_)
                if token.pos_ == "VERB" and not token.is_stop:
                    verbs.append(token.lemma_)
                if token.pos_ == "ADJ":
                    adjectives.append(token.lower_)
                if token.pos_ == "NOUN":
                    nouns.append(token.lower_)
        # Re-parse the sentence in isolation for entity extraction
        subdoc = nlp(sent.text)
        for ent in subdoc.ents:
            if len(ent.text) > 2:
                entities.append(ent.text)
                entity_types.append(ent.label_)
    adverbs_data = Counter(adverbs).most_common()
    verbs_data = Counter(verbs).most_common()
    nouns_data = Counter(nouns).most_common()
    adjectives_data = Counter(adjectives).most_common()
    entities_data = Counter(entities).most_common()
    entity_types_data = Counter(entity_types).most_common()
    data = {
        "Adverbs": adverbs_data,
        "Verbs": verbs_data,
        "Nouns": nouns_data,
        "Adjectives": adjectives_data,
        "Entities": entities_data,
        "Entity types": entity_types_data
    }
    return data
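A quick way to exercise get_data, assuming the snippet's module (with its Flask app for app.logger and its Counter import) is in scope; the sample text is made up. A model that ships word vectors is used because get_data skips tokens flagged is_oov, which in the vector-less sm models is every token:

import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("Apple is looking at buying a U.K. startup for $1 billion. "
          "The startup quietly builds very fast chips.")
data = get_data(nlp, doc)
print(data["Nouns"][:3])      # most common nouns with counts
print(data["Entity types"])   # e.g. counts for ORG, GPE, MONEY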
Code Example #7
    def __init__(self, path: str = None,
                 date: str = None,
                 index: int = None,
                 text: str = None,
                 candidate_pos: list = None,
                 window_size: int = 4,
                 damp_coef: float = 0.85,
                 min_diff: float = 1e-5,
                 steps: int = 10,
                 stopwords: set = None,
                 to_lower: bool = True):

        if text and path:
            raise ValueError('text OR path should be passed, not both.')
        elif not text and not path:
            raise ValueError('path or text should be passed.')
        elif (not date or len(date.split('-')) != 3) and path:
            raise ValueError('If path exists, date should be passed in format dd-MM-yyyy.')
        elif index is None and path:
            raise ValueError('If path exists, index should be passed.')

        if text:
            text = translator.translate(text, src='ru', dest='en').text
        elif path:
            with Storage(path=path) as storage:
                text = translator.translate(storage.data[date][str(index)], src='ru', dest='en').text

        self.doc = nlp(text)
        self.date = date
        self.index = index

        self.candidate_pos = candidate_pos if candidate_pos else ['NOUN', 'PRON']
        self.window_size = window_size
        self.damp_coef = damp_coef
        self.min_diff = min_diff
        self.steps = steps
        self.stopwords = stopwords
        self.to_lower = to_lower
        self.node_weight = None
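This constructor reads like a TextRank-style keyword extractor (window_size, damp_coef, steps), and it leans on two module-level globals not shown here. A plausible setup, hypothetical except for the translate(src='ru', dest='en') call above, which matches the googletrans API:

import spacy
from googletrans import Translator

nlp = spacy.load('en_core_web_sm')   # the model choice is a guess
translator = Translator()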
Code Example #8
File: fast_api.py Project: rockycoder/lang_research
def get_score(keyword):
    # comp_str holds the parsed title Doc set by keywords() (Code Example #2)
    doc1 = nlp(keyword)
    l[keyword] = doc1.similarity(comp_str)
    return keyword, l[keyword]
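Returning the (keyword, score) pair is what makes Code Example #2 work: the write to `l` happens inside a worker process and never reaches the parent, so only the values returned through executor.map survive, and the parent stores them in `response`.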