def compute_textual_indices(text):
    lang = str_to_lang("de")
    model = get_default_model(lang)
    # model = VECTOR_MODELS[lang][CorporaEnum.WIKI][VectorModelType.WORD2VEC](
    #     name=CorporaEnum.WIKI.value, lang=Lang.DE)
    doc = Document(lang, text)
    cna_graph = CnaGraph(docs=doc, models=[model])
    compute_indices(doc=doc, cna_graph=cna_graph)
    # Collect the raw text, split into blocks (paragraphs) and their sentences.
    blocks = []
    for block in doc.get_blocks():
        sentences = [sentence.text for sentence in block.get_sentences()]
        blocks.append({'text': block.text, 'sentences': sentences})
    feedback_text = {'doc': doc.text, 'blocks': blocks}
    # Complexity indices at each granularity level.
    sentence_indices = [sentence.indices for sentence in doc.get_sentences()]
    block_indices = [block.indices for block in doc.get_blocks()]
    return {
        'text': feedback_text,
        'indices': {
            'document': doc.indices,
            'sentence': sentence_indices,
            'block': block_indices
        }
    }
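
# Minimal usage sketch (illustrative, not part of the original module): feeds a
# short German text through compute_textual_indices. The sample string is a
# made-up placeholder; the function itself hard-codes German via str_to_lang("de").
def _example_compute_textual_indices():
    sample = "Das ist ein kurzer Beispieltext. Er besteht aus zwei Sätzen."
    result = compute_textual_indices(sample)
    # Document-level complexity indices, keyed by index objects.
    print(result['indices']['document'])
    # Paragraph (block) texts with their sentence splits.
    for block in result['text']['blocks']:
        print(block['text'], block['sentences'])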
def modify_sentence(sent: str):
    global args, speller, word_set, LOWER, MATCH_ALPHA_WORD
    doc = Document(text=sent, lang=Lang.RO)
    # Work on the token strings and pass each transformation's output to the
    # next (the original assigned to a second, misspelled variable and then
    # reconstructed from the untransformed token list).
    tokens = [token.text for token in doc.get_words()]
    tokens = modify_words(tokens)
    tokens = modify_chars(tokens)
    return reconstruct_sentence(tokens)
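
# Usage sketch (illustrative): modify_sentence tokenizes a Romanian sentence,
# applies modify_words and modify_chars, and reassembles it. The sample sentence
# is a placeholder; modify_words, modify_chars and reconstruct_sentence are
# defined elsewhere in this module.
def _example_modify_sentence():
    print(modify_sentence("Aceasta este o propoziție de test."))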
def textSimilarityPost():
    params = json.loads(request.get_data())
    corpus = params.get('corpus') if params.get('corpus') is not None else 'le_monde_small'
    languageString = params.get('language')
    lang = str_to_lang(languageString)
    texts = params.get('texts')

    # Load whichever vector models are available on disk for this corpus;
    # a missing model is skipped rather than failing the whole request.
    vectorModels = []
    for model_class in (LSA, LDA, Word2Vec):
        try:
            vectorModels.append(model_class(corpus, lang))
        except FileNotFoundError as inst:
            print(inst)

    # Score every unordered pair of input texts with every loaded model.
    noTexts = len(texts)
    pairs = []
    for i in range(noTexts):
        document1 = Document(lang, texts[i])
        for j in range(i + 1, noTexts):
            document2 = Document(lang, texts[j])
            scores = [
                ScoreDTO(vectorModel.type.name, vectorModel.similarity(document1, document2))
                for vectorModel in vectorModels
            ]
            pairs.append(PairDTO(i, j, scores))

    scoresDTO = ScoresDTO(lang, corpus, pairs)
    textSimilarityResponse = TextSimilarityResponse(scoresDTO, "", True)
    return textSimilarityResponse.toJSON()
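
# Client-side sketch (illustrative; the route name is an assumption, adjust to
# the actual route registration): posts two texts and prints the pairwise
# similarity scores returned for every available model.
#
#   import requests
#   payload = {
#       "language": "fr",
#       "corpus": "le_monde_small",
#       "texts": ["Premier texte.", "Deuxième texte."]
#   }
#   response = requests.post("http://localhost:5000/text-similarity", json=payload)
#   print(response.json())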
def compute_graph(dataName, JsonName, texts: List[str], lang: Lang, models: List) -> dict:
    docs = [Document(lang=lang, text=text) for text in texts]
    models = [
        create_vector_model(lang, VectorModelType.from_str(model["model"]), model["corpus"])
        for model in models
    ]
    models = [model for model in models if model is not None]
    graph = CnaGraph(docs=docs, models=models)

    # Human-readable names for every node in the document hierarchy.
    names = {}
    for doc_index, doc in enumerate(docs):
        names[doc] = "Document {}".format(doc_index + 1)
        for paragraph_index, paragraph in enumerate(doc.components):
            names[paragraph] = "Paragraph {}.{}".format(
                doc_index + 1, paragraph_index + 1)
            for sentence_index, sentence in enumerate(paragraph.components):
                names[sentence] = "Sentence {}.{}.{}".format(
                    doc_index + 1, paragraph_index + 1, sentence_index + 1)

    result = {
        "data": {
            "name": "Document Set",
            "value": None,
            "type": None,
            "importance": None,
            "children": [encode_element(doc, names, graph, lang) for doc in docs]
        }
    }

    # Collect all non-structural edges (everything except ADJACENT / PART_OF),
    # grouped by (source, target) pair.
    edges = {}
    for a, b, data in graph.graph.edges(data=True):
        if data["type"] is EdgeType.ADJACENT or data["type"] is EdgeType.PART_OF:
            continue
        if data["type"] is EdgeType.COREF:
            edge_type = EdgeType.COREF.name
        else:
            edge_type = "{}: {}".format(data["type"].name, data["model"].name)
        if (names[a], names[b]) not in edges:
            edges[(names[a], names[b])] = []
        edges[(names[a], names[b])].append({
            "name": edge_type,
            "weight": str(data["value"]) if "value" in data else None,
            "details": data["details"] if "details" in data else None,
        })
    edges = [
        {"source": pair[0], "target": pair[1], "types": types}
        for pair, types in edges.items()
    ]

    compute_nxGraph(dataName, JsonName, docs, names, graph, edges, lang)
    result["data"]["edges"] = edges
    return result
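
# Usage sketch (illustrative): builds a CNA graph over two hypothetical texts.
# The model spec dicts mirror what compute_graph expects ("model" and "corpus"
# keys); the model and corpus names here are placeholders.
def _example_compute_graph():
    result = compute_graph(
        dataName="demo", JsonName="demo.json",
        texts=["First document text.", "Second document text."],
        lang=Lang.EN,
        models=[{"model": "word2vec", "corpus": "coca"}])
    for edge in result["data"]["edges"]:
        print(edge["source"], "->", edge["target"], edge["types"])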
def compute_indices_format(text):
    lang = str_to_lang("de")
    model = get_default_model(lang)
    # model = VECTOR_MODELS[lang][CorporaEnum.WIKI][VectorModelType.WORD2VEC](
    #     name=CorporaEnum.WIKI.value, lang=Lang.DE)
    doc = Document(lang, text)
    cna_graph = CnaGraph(docs=doc, models=[model])
    compute_indices(doc=doc, cna_graph=cna_graph)
    return {str(key): value for key, value in doc.indices.items()}
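
# Usage sketch (illustrative): compute_indices_format returns a flat
# {index_name: value} mapping for a German text. The sample is a placeholder.
def _example_compute_indices_format():
    indices = compute_indices_format("Ein kurzer deutscher Beispieltext.")
    for name, value in indices.items():
        print(name, value)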
def filter_by_similarity(kept_lessons, text, models, lang, otherdomains):
    aux_lessons = copy.deepcopy(kept_lessons)
    print(aux_lessons)
    if otherdomains:
        # kept_lessons is a list of lesson dicts; keep a lesson if any of its
        # 'learn_details' entries is similar enough to the query text.
        for lesson in aux_lessons:
            is_similar = False
            print(lesson['published_title'])
            for learn in lesson['learn_details']:
                document1 = Document(lang, text)
                document2 = Document(lang, learn)
                for vectorModel in models:
                    similarity_score = vectorModel.similarity(document1, document2)
                    if similarity_score > threshold_other:
                        is_similar = True
                        print(similarity_score)
            if not is_similar:
                kept_lessons.remove(lesson)
    else:
        # kept_lessons is a dict keyed by lesson descriptives; keep a lesson if
        # its description is similar enough to the query text.
        for lesson_descriptives, lesson in aux_lessons.items():
            document1 = Document(lang, text)
            document2 = Document(lang, lesson.description)
            is_similar = False
            for vectorModel in models:
                similarity_score = vectorModel.similarity(document1, document2)
                if similarity_score > threshold:
                    # Record the score on the returned lesson, not on the
                    # deepcopy being iterated (the original set it on the copy).
                    kept_lessons[lesson_descriptives].similarityScore = similarity_score
                    is_similar = True
            if not is_similar:
                del kept_lessons[lesson_descriptives]
    print(len(kept_lessons))
    return kept_lessons
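
# Usage sketch (illustrative): the two branches expect different shapes. With
# otherdomains=True, kept_lessons is a list of dicts carrying 'published_title'
# and 'learn_details'; otherwise it is a dict of lesson objects exposing
# .description and .similarityScore. All values below are placeholders; `models`
# would hold ReaderBench vector models, and `threshold` / `threshold_other` are
# module-level globals.
#
#   lessons = [{'published_title': 'Intro to ML',
#               'learn_details': ['Learn the basics of machine learning.']}]
#   kept = filter_by_similarity(lessons, "machine learning fundamentals",
#                               models, Lang.EN, otherdomains=True)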
def compute_indices_format_array(questions):
    lang = str_to_lang("de")
    model = get_default_model(lang)
    # model = VECTOR_MODELS[lang][CorporaEnum.WIKI][VectorModelType.WORD2VEC](
    #     name=CorporaEnum.WIKI.value, lang=Lang.DE)
    result = []
    for question in questions:
        # Reference (expert) answer; the original left this document unnamed,
        # so the later references to `expert` raised a NameError. Its indices
        # remain available on the document itself after compute_indices.
        expert = Document(lang, question['expert'])
        cna_graph = CnaGraph(docs=expert, models=[model])
        compute_indices(doc=expert, cna_graph=cna_graph)

        # Student answer, whose indices feed the level prediction and feedback.
        doc = Document(lang, question['text'])
        cna_graph = CnaGraph(docs=doc, models=[model])
        compute_indices(doc=doc, cna_graph=cna_graph)

        blocks = []
        for block in doc.get_blocks():
            sentences = [sentence.text for sentence in block.get_sentences()]
            blocks.append({'text': block.text, 'sentences': sentences})
        feedback_text = {'doc': doc.text, 'blocks': blocks}
        doc_indices = {
            'text': feedback_text,
            'indices': {
                'document': doc.indices,
                'sentence': [sentence.indices for sentence in doc.get_sentences()],
                'block': [block.indices for block in doc.get_blocks()]
            }
        }
        level = predictLevel(doc_indices['indices']['document'])
        result.append({
            'feedback': compare_feedback(expert, doc_indices),
            'level': level
        })
    return result
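
# Usage sketch (illustrative): each question pairs an expert (reference) answer
# with a student answer; the function returns feedback and a predicted level per
# question. The German strings are placeholders.
def _example_compute_indices_format_array():
    questions = [{
        'expert': "Die erwartete Antwort des Experten.",
        'text': "Die Antwort der Studentin."
    }]
    for entry in compute_indices_format_array(questions):
        print(entry['level'], entry['feedback'])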
def textualComplexityPost():
    params = json.loads(request.get_data())
    text = params.get('text')
    languageString = params.get('language')
    lang = str_to_lang(languageString)
    # lsa = params.get('lsa')
    # lda = params.get('lda')
    # w2v = params.get('w2v')

    # Pick the default word2vec corpus for each supported language.
    if lang is Lang.RO:
        corpus = CorporaEnum.README
    elif lang is Lang.EN:
        corpus = CorporaEnum.COCA
    elif lang is Lang.ES:
        corpus = CorporaEnum.JOSE_ANTONIO
    elif lang is Lang.FR:
        corpus = CorporaEnum.LE_MONDE
    elif lang is Lang.RU:
        corpus = CorporaEnum.RNC_WIKIPEDIA
    elif lang is Lang.DE:
        corpus = CorporaEnum.WIKI
    vector_model = VECTOR_MODELS[lang][corpus][VectorModelType.WORD2VEC](
        name=corpus.value, lang=lang)

    document = Document(lang=lang, text=text)
    cna_graph = CnaGraph(docs=document, models=[vector_model])
    compute_indices(doc=document, cna_graph=cna_graph)

    categoriesList = []
    complexityIndices = {}

    def add_index(key, value, **kwargs):
        # Group index DTOs by category; remember each category once.
        categoryName = key.category.name
        if categoryName not in categoriesList:
            categoriesList.append(categoryName)
        if categoryName not in complexityIndices:
            complexityIndices[categoryName] = []
        complexityIndices[categoryName].append(
            ComplexityIndexDTO(repr(key), float(value), **kwargs))

    for key, value in document.indices.items():
        add_index(key, value, type="document")

    # data = {}
    # for key, v in document.indices.items():
    #     data[repr(key)] = [v]
    # load the model from disk
    # loaded_model = pickle.load(open("rb_api/textual_complexity/lsvc.sav", 'rb'))
    # item = pd.DataFrame.from_dict(data)
    # level = loaded_model.predict(item)[0]
    # TODO: train a model; the complexity level is hard-coded until then.
    level = "C"

    for paragraph_id, paragraph in enumerate(document.components):
        for key, value in paragraph.indices.items():
            add_index(key, value, type="paragraph", paragraph_index=paragraph_id)

    for paragraph_id, paragraph in enumerate(document.components):
        for sentence_id, sentence in enumerate(paragraph.components):
            for key, value in sentence.indices.items():
                add_index(key, value, type="sentence",
                          paragraph_index=paragraph_id, sentence_index=sentence_id)

    complexityIndicesResponse = [
        ComplexityIndicesDTO(category, indices)
        for category, indices in complexityIndices.items()
    ]
    texts = [[sentence.text for sentence in paragraph.components]
             for paragraph in document.components]

    textualComplexityDataDTO = TextualComplexityDataDTO(
        languageString, level, texts, categoriesList, complexityIndicesResponse)
    textualComplexityResponse = TextualComplexityResponse(
        textualComplexityDataDTO, "", True)
    return textualComplexityResponse.toJSON()
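
# Client-side sketch (illustrative; the route name is an assumption, adjust to
# the actual route registration): requests textual complexity indices for an
# English text and prints the returned JSON, which carries the level, the
# paragraph/sentence split, the category list, and per-category index DTOs.
#
#   import requests
#   payload = {"language": "en",
#              "text": "A short sample text. It has two sentences."}
#   response = requests.post("http://localhost:5000/textual-complexity", json=payload)
#   print(response.json())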