def task(queue, title):
    doc = nlp(title)
    while not queue.empty():
        word = queue.get()
        if word not in l:
            # Score the keyword against the title by spaCy vector similarity.
            doc1 = nlp(word)
            l[word] = doc1.similarity(doc)
            # sen.append(word)
        else:
            print("exists")
        queue.task_done()

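# Minimal usage sketch for task(), assuming the module-level spaCy model `nlp`
# and shared score dict `l` used above; the sample keywords, the queue setup and
# the en_core_web_md model choice are illustrative assumptions, not original code.
import queue

import spacy

nlp = spacy.load("en_core_web_md")  # a model with vectors, so similarity() is meaningful
l = {}

work_queue = queue.Queue()
for word in ["wireless mouse", "usb hub", "mechanical keyboard"]:
    work_queue.put(word)

task(work_queue, "ergonomic wireless mouse")
print(sorted(l.items(), key=lambda kv: kv[1], reverse=True))
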
def keywords():
    payload = flask.request.get_json(force=True)
    keywords = payload["keywords"]
    title = payload["title"]
    title = nlp(title)
    Asin = payload["Asin"]
    sort = payload["sort"]
    l.clear()
    response.clear()
    global comp_str
    comp_str = ""
    if sort:
        comp_str = title
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            for (word, score) in executor.map(get_score, keywords):
                response[word] = score
        result = {}  # dictionary to keep output
        key = []  # list to store keywords and relevance_score
        result["ASIN"] = Asin
        sorted_l = sorted(response.items(), key=operator.itemgetter(1), reverse=True)
        for r in sorted_l:
            key.append({'keyword': r[0], 'relevance_score': r[1]})
        result["result"] = key
        return flask.jsonify(result)
    else:
        result = {"keywords": keywords}
        return flask.jsonify(result)

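# Illustrative client call for the keywords() endpoint above; the "/keywords"
# route, host/port and the sample ASIN/keyword values are assumptions, since the
# @app.route decorator is not part of this snippet.
import requests

payload = {
    "Asin": "B000123456",
    "title": "ergonomic wireless mouse",
    "keywords": ["bluetooth mouse", "usb hub", "mechanical keyboard"],
    "sort": True,
}
resp = requests.post("http://localhost:5000/keywords", json=payload)
print(resp.json())  # {"ASIN": ..., "result": [{"keyword": ..., "relevance_score": ...}, ...]}
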
def analyze(request_uuid, filename):
    filepath = os.path.join(app.config["UPLOAD_FOLDER"], request_uuid, filename)
    n_max_words = 15
    try:
        with open(filepath, "rb") as file_in:
            pdf_byte_content = BytesIO(file_in.read())
        # extract_text() is what actually raises PDFSyntaxError on a malformed PDF,
        # so it must run inside the try block for the 400 response to be reachable.
        corpus = extract_text(pdf_byte_content)
    except PDFSyntaxError as e:
        return make_response(jsonify(status="KO", message=str(e)), 400)
    app.logger.info("Loading spaCy English model may take up to 1 minute")
    app.logger.info("Model loaded")
    doc = nlp(corpus)
    doc_data = get_data(nlp, doc)
    norm_data = normalize_data(doc_data)
    results_dir = os.path.join(app.config["RESULTS_FOLDER"], request_uuid)
    create_nonexistent_dir(results_dir)
    save_wordcloud(corpus, results_dir)
    for pos in norm_data.keys():
        if len(norm_data[pos]) != 0:
            plot_pos(norm_data[pos], results_dir, n_max_words, type_pos=pos)
    tp = TextPreprocessor(corpus)
    cleaned_text = tp.preprocess()
    kwords_data = kwords_count(cleaned_text)
    if len(kwords_data) != 0:
        plot_kwords(kwords_data, results_dir, n_max_words)
    else:
        app.logger.warning("No keywords found in the provided PDF")
    return serve_plots(request_uuid)

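# Standalone illustration of the PDF-to-text step used in analyze(), based on
# pdfminer.six (the likely source of extract_text and PDFSyntaxError here);
# the sample file name is illustrative.
from io import BytesIO

from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFSyntaxError

try:
    with open("sample.pdf", "rb") as f:
        text = extract_text(BytesIO(f.read()))
    print(text[:200])
except PDFSyntaxError as e:
    print("Not a valid PDF:", e)
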
def keyword():
    # exitFlag = 0
    payload = flask.request.get_json(force=True)
    keywords = payload["keywords"]
    title = payload["title"]
    title = nlp(title)
    Asin = payload["Asin"]
    sort = payload["sort"]
    l.clear()
    if sort:
        global exitFlag
        exitFlag = 0
        threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5",
                      "Thread-6", "Thread-7", "Thread-8", "Thread-9", "Thread-10"]
        workQueue.queue.clear()
        threads = []
        threadID = 1
        # Create new threads
        for tName in threadList:
            thread = process_keywords.myThread(threadID, tName, workQueue, title)
            thread.start()
            threads.append(thread)
            threadID += 1
        # Fill the queue
        queueLock.acquire()
        for word in keywords:
            workQueue.put(word.strip())
        queueLock.release()
        # Wait for the queue to empty
        while not workQueue.empty():
            pass
        # Notify threads it's time to exit
        exitFlag = 1
        # Wait for all threads to complete
        for t in threads:
            t.join()
        print("Exiting Main Thread")
        key = []
        result = {}
        sorted_l = sorted(l.items(), key=operator.itemgetter(1), reverse=True)
        for r in sorted_l:
            key.append({'keyword': r[0], 'relevance_score': r[1]})
        result["result"] = key
        return flask.jsonify(result)
    else:
        result = {"keywords": keywords}
        return flask.jsonify(result)

def get_keyword(threadName, q, title):
    # sorted_l = sorted(l.items(), key=operator.itemgetter(1), reverse=True)
    # return sorted_l
    while not new_app.exitFlag:
        new_app.queueLock.acquire()
        if not new_app.workQueue.empty():
            data = q.get()
            new_app.queueLock.release()
            # Score the keyword against the title and store it in the shared dict.
            doc1 = nlp(data)
            score = doc1.similarity(title)
            new_app.l[data] = score
            print("%s processing %s %-100s" % (threadName, data, datetime.datetime.utcnow()))
        else:
            new_app.queueLock.release()

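# A plausible shape for the process_keywords.myThread class used by the keyword()
# endpoint above; this is an assumption reconstructed from get_keyword's signature
# and the constructor arguments (threadID, tName, workQueue, title), not the
# original source.
import threading

class myThread(threading.Thread):
    def __init__(self, threadID, name, q, title):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.q = q
        self.title = title

    def run(self):
        # Each worker keeps pulling keywords until new_app.exitFlag is set.
        get_keyword(self.name, self.q, self.title)
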
def get_data(nlp, doc):
    """
    Get doc main POS data and entities with their types from a given corpus
    :param nlp: spacy.lang.en.English
    :param doc: spacy.tokens.doc.Doc object
    :return: dict
    """
    app.logger.debug("Getting data from doc")
    nouns, adjectives = [], []
    adverbs, verbs = [], []
    entities, entity_types = [], []
    for sent in doc.sents:
        for token in sent:
            if not token.is_oov and len(token.text) > 1:
                if token.pos_ == "ADV":
                    adverbs.append(token.lower_)
                if token.pos_ == "VERB" and not token.is_stop:
                    verbs.append(token.lemma_)
                if token.pos_ == "ADJ":
                    adjectives.append(token.lower_)
                if token.pos_ == "NOUN":
                    nouns.append(token.lower_)
        subdoc = nlp(sent.text)
        for ent in subdoc.ents:
            if len(ent.text) > 2:
                entities.append(ent.text)
                entity_types.append(ent.label_)
    adverbs_data = Counter(adverbs).most_common()
    verbs_data = Counter(verbs).most_common()
    nouns_data = Counter(nouns).most_common()
    adjectives_data = Counter(adjectives).most_common()
    entities_data = Counter(entities).most_common()
    entity_types_data = Counter(entity_types).most_common()
    data = {
        "Adverbs": adverbs_data,
        "Verbs": verbs_data,
        "Nouns": nouns_data,
        "Adjectives": adjectives_data,
        "Entities": entities_data,
        "Entity types": entity_types_data
    }
    return data

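# Minimal driver for get_data(); the Flask app and the en_core_web_md model defined
# here stand in for the application's module-level globals and are assumptions.
# A model with word vectors is chosen so the `token.is_oov` filter behaves as intended.
import spacy
from flask import Flask

app = Flask(__name__)
nlp = spacy.load("en_core_web_md")

doc = nlp("The quick brown fox jumps over the lazy dog near London Bridge.")
data = get_data(nlp, doc)
print(data["Nouns"])         # e.g. [('fox', 1), ('dog', 1), ...]
print(data["Entity types"])  # entity labels found by the model, if any
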
def __init__(self, path: str = None, date: str = None, index: int = None, text: str = None,
             candidate_pos: list = None, window_size: int = 4, damp_coef: float = 0.85,
             min_diff: float = 1e-5, steps: int = 10, stopwords: set = None, to_lower: bool = True):
    # Exactly one of `text` or `path` must be provided; a path also requires date and index.
    if text and path:
        raise ValueError('text OR path should be passed, not both.')
    elif not text and not path:
        raise ValueError('path or text should be passed.')
    elif (not date or len(date.split('-')) != 3) and path:
        raise ValueError('If path exists, date should be passed in format dd-MM-yyyy.')
    elif index is None and path:
        raise ValueError('If path exists, index should be passed.')
    if text:
        text = translator.translate(text, src='ru', dest='en').text
    elif path:
        with Storage(path=path) as storage:
            text = translator.translate(storage.data[date][str(index)], src='ru', dest='en').text
    self.doc = nlp(text)
    self.date = date
    self.index = index
    self.candidate_pos = candidate_pos if candidate_pos else ['NOUN', 'PRON']
    self.window_size = window_size
    self.damp_coef = damp_coef
    self.min_diff = min_diff
    self.steps = steps
    self.stopwords = stopwords
    self.to_lower = to_lower
    self.node_weight = None

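# Hypothetical construction examples for the class this __init__ belongs to; the
# class name TextRank is a placeholder, and the file path, date and index values
# are illustrative. Both paths translate the Russian source text with googletrans
# before handing it to spaCy.
kw_from_text = TextRank(text="Пример русского текста для извлечения ключевых слов.")
kw_from_file = TextRank(path="archive.json", date="01-02-2021", index=0)
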
def get_score(keyword):
    # Relies on the module-level nlp, l and comp_str globals; when mapped with
    # ProcessPoolExecutor these must also be available in the worker processes
    # (e.g. inherited at fork time on POSIX).
    doc1 = nlp(keyword)
    l[keyword] = doc1.similarity(comp_str)
    return keyword, l[keyword]

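# Standalone check of get_score(), assuming nlp and l are initialised as in the
# earlier snippets; the sample title and keyword are illustrative.
comp_str = nlp("ergonomic wireless mouse")
word, score = get_score("bluetooth mouse")
print(word, round(score, 3))
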