コード例 #1
0
def compute_textual_indices(text):
    """Compute ReaderBench textual-complexity indices for a German text.

    Parses *text* into a Document, builds the CNA graph with the default
    German vector model, and returns a dict with:
      - 'text': the document text mirrored as blocks -> sentences, and
      - 'indices': the document-, sentence-, and block-level index values.
    """
    lang = str_to_lang("de")  # indices are currently computed for German only
    model = get_default_model(lang)

    doc = Document(lang, text)
    cna_graph = CnaGraph(docs=doc, models=[model])
    compute_indices(doc=doc, cna_graph=cna_graph)

    # Mirror the document structure (blocks and their sentences) as plain text.
    blocks_text = [
        {'text': b.text, 'sentences': [s.text for s in b.get_sentences()]}
        for b in doc.get_blocks()
    ]
    feedback_text = {'doc': doc.text, 'blocks': blocks_text}

    return {
        'text': feedback_text,
        'indices': {
            'document': doc.indices,
            'sentence': [sent.indices for sent in doc.get_sentences()],
            'block': [block.indices for block in doc.get_blocks()],
        }
    }
コード例 #2
0
def modify_sentence(sent: str):
    """Tokenize a Romanian sentence, perturb its words and characters,
    and rebuild the sentence from the modified tokens.

    Bug fix: the original code assigned the results of ``modify_words`` and
    ``modify_chars`` to a typo'd name (``tokents`` vs ``tokenst``) and then
    returned the *unmodified* tokens, silently discarding both
    transformations. The stages are now chained correctly.
    """
    global args, speller, word_set, LOWER, MATCH_ALPHA_WORD

    doc = Document(text=sent, lang=Lang.RO)
    tokens = [token.text for token in doc.get_words()]

    # Each stage must see the previous stage's output.
    tokens = modify_words(tokens)
    tokens = modify_chars(tokens)

    return reconstruct_sentence(tokens)
コード例 #3
0
def textSimilarityPost():
    """HTTP handler: score the pairwise similarity of a list of texts.

    Expects a JSON body with 'texts', 'language', and an optional 'corpus'
    (defaults to 'le_monde_small'). Every unordered pair of texts is scored
    with each vector model (LSA, LDA, word2vec) that can be loaded for the
    requested corpus; models missing on disk are skipped with a log line.

    Returns the JSON-serialized TextSimilarityResponse.
    """
    params = json.loads(request.get_data())
    corpus = params.get('corpus')
    if corpus is None:  # was `!= None`; identity check is the Python idiom
        corpus = 'le_monde_small'
    languageString = params.get('language')
    lang = str_to_lang(languageString)
    texts = params.get('texts')

    # Load whichever of the three model types exist for this corpus.
    vectorModels = []
    for modelClass in (LSA, LDA, Word2Vec):
        try:
            vectorModels.append(modelClass(corpus, lang))
        except FileNotFoundError as inst:
            print(inst)

    # Parse each text exactly once; the original rebuilt document2 inside
    # the pair loop, re-parsing every text O(n) times.
    documents = [Document(lang, text) for text in texts]

    pairs = []
    for i in range(len(documents)):
        for j in range(i + 1, len(documents)):
            scores = [
                ScoreDTO(vectorModel.type.name,
                         vectorModel.similarity(documents[i], documents[j]))
                for vectorModel in vectorModels
            ]
            pairs.append(PairDTO(i, j, scores))

    scoresDTO = ScoresDTO(lang, corpus, pairs)
    textSimilarityResponse = TextSimilarityResponse(scoresDTO, "", True)
    return textSimilarityResponse.toJSON()
コード例 #4
0
def compute_graph(dataName, JsonName, texts: List[str], lang: Lang,
                  models: List) -> dict:
    """Build the CNA graph for a set of texts and serialize it.

    Parses each text into a Document, builds a CnaGraph over the requested
    vector models (model specs that fail to resolve are dropped), labels
    every document/paragraph/sentence node with a human-readable name, and
    returns a dict containing the hierarchical structure plus the
    non-structural graph edges. Also persists a networkx representation
    via compute_nxGraph.

    Fixes: the return annotation said ``-> str`` but the function returns a
    dict; removed the dead ``sentence_index = 1`` / ``doc_index = 1``
    initializers (both are rebound by ``enumerate``).
    """
    docs = [Document(lang=lang, text=text) for text in texts]
    models = [
        create_vector_model(lang, VectorModelType.from_str(model["model"]),
                            model["corpus"]) for model in models
    ]
    models = [model for model in models if model is not None]
    graph = CnaGraph(docs=docs, models=models)

    # Labels: "Document d", "Paragraph d.p", "Sentence d.p.s" (1-based).
    names = {}
    for doc_index, doc in enumerate(docs):
        names[doc] = "Document {}".format(doc_index + 1)
        for paragraph_index, paragraph in enumerate(doc.components):
            names[paragraph] = "Paragraph {}.{}".format(
                doc_index + 1, paragraph_index + 1)
            for sentence_index, sentence in enumerate(paragraph.components):
                names[sentence] = "Sentence {}.{}.{}".format(
                    doc_index + 1, paragraph_index + 1, sentence_index + 1)

    result = {
        "data": {
            "name": "Document Set",
            "value": None,
            "type": None,
            "importance": None,
            "children":
            [encode_element(doc, names, graph, lang) for doc in docs]
        }
    }

    # Keep only semantic edges; ADJACENT/PART_OF edges are structural and
    # already implied by the hierarchy above.
    edges = {}
    for a, b, data in graph.graph.edges(data=True):
        if data["type"] is EdgeType.ADJACENT or data["type"] is EdgeType.PART_OF:
            continue
        if data["type"] is EdgeType.COREF:
            edge_type = EdgeType.COREF.name
        else:
            edge_type = "{}: {}".format(data["type"].name,
                                        data["model"].name)
        edge = {
            "name": edge_type,
            "weight": str(data["value"]) if "value" in data else None,
            "details": data["details"] if "details" in data else None,
        }
        edges.setdefault((names[a], names[b]), []).append(edge)
    edges = [{
        "source": pair[0],
        "target": pair[1],
        "types": types,
    } for pair, types in edges.items()]
    compute_nxGraph(dataName, JsonName, docs, names, graph, edges, lang)
    result["data"]["edges"] = edges
    return result
コード例 #5
0
def compute_indices_format(text):
    """Compute German textual-complexity indices for *text*.

    Returns a flat dict mapping the string form of each index key to its
    value, so the result is directly JSON-serializable.
    """
    lang = str_to_lang("de")  # German only for now
    model = get_default_model(lang)
    doc = Document(lang, text)  # use `lang` consistently (was Lang.DE literal)
    cna_graph = CnaGraph(docs=doc, models=[model])
    compute_indices(doc=doc, cna_graph=cna_graph)

    # Stringify the index keys (index objects are not JSON keys).
    return {str(key): value for key, value in doc.indices.items()}
コード例 #6
0
def filter_by_similarity(kept_lessons, text, models, lang, otherdomains):
    """Drop from *kept_lessons* every lesson not semantically similar to *text*.

    Two modes:
      - otherdomains truthy: kept_lessons is a list of dicts; a lesson is kept
        if ANY of its 'learn_details' entries scores above `threshold_other`
        under ANY model.
      - otherwise: kept_lessons is a dict; a lesson is kept if its description
        scores above `threshold` under any model, and its similarityScore is
        recorded.

    Fixes: the query Document is built once (it was rebuilt per lesson);
    `similarity_score` is pre-initialized so the debug print cannot raise
    UnboundLocalError when *models* is empty; `similarityScore` is now set on
    the lesson stored in *kept_lessons* (the original set it on the deepcopy,
    so the score was silently lost).
    """
    aux_lessons = copy.deepcopy(kept_lessons)
    print(aux_lessons)
    # The query document does not depend on the lesson: build it once.
    document1 = Document(lang, text)
    if otherdomains:
        for lesson in aux_lessons:
            is_similar = 0
            print(lesson['published_title'])
            for learn in lesson['learn_details']:
                document2 = Document(lang, learn)
                similarity_score = None  # guard for an empty model list
                for vectorModel in models:
                    similarity_score = vectorModel.similarity(
                        document1, document2)
                    if similarity_score > threshold_other:
                        is_similar = 1
                print(similarity_score)
            if not is_similar:
                kept_lessons.remove(lesson)
    else:
        for lesson_descriptives, lesson in aux_lessons.items():
            document2 = Document(lang, lesson.description)
            is_similar = 0
            for vectorModel in models:
                similarity_score = vectorModel.similarity(
                    document1, document2)
                if similarity_score > threshold:
                    # Record on the lesson that is actually returned, not on
                    # the deepcopy used for iteration.
                    kept_lessons[lesson_descriptives].similarityScore = \
                        similarity_score
                    is_similar = 1
            if not is_similar:
                del kept_lessons[lesson_descriptives]
    print(len(kept_lessons))
    return kept_lessons
コード例 #7
0
def compute_indices_format_array(questions):
    """Compute complexity indices and feedback for a list of questions.

    Each question dict carries an 'expert' reference answer and a student
    'text'; both are parsed and indexed, the indices are packaged, and
    `compare_feedback` / `predictLevel` produce the per-question result.

    Fixes: the expert Document was bound to ``doc`` and then overwritten by
    the student Document, so the later references to ``expert`` (blocks,
    indices, compare_feedback) raised NameError — the expert Document now
    keeps its own name. Also removed the first ``doc_indices`` dict, which
    was dead code (overwritten without ever being read).
    """
    lang = str_to_lang("de")
    model = get_default_model(lang)
    result = []
    for question in questions:
        # Expert (reference) answer.
        expert = Document(Lang.DE, question['expert'])
        cna_graph = CnaGraph(docs=expert, models=[model])
        compute_indices(doc=expert, cna_graph=cna_graph)

        # Student answer.
        doc = Document(Lang.DE, question['text'])
        cna_graph = CnaGraph(docs=doc, models=[model])
        compute_indices(doc=doc, cna_graph=cna_graph)

        # NOTE(review): as in the original, the feedback mixes the expert's
        # blocks/document indices with the student doc's text and
        # sentence/block indices — confirm this asymmetry is intended.
        block = []
        for b in expert.get_blocks():
            sent = [s.text for s in b.get_sentences()]
            block.append({'text': b.text, 'sentences': sent})

        feedback_text = {'doc': doc.text, 'blocks': block}
        sentences = [sent.indices for sent in doc.get_sentences()]
        blocks = [b.indices for b in doc.get_blocks()]

        doc_indices = {
            'text': feedback_text,
            'indices': {
                'document': expert.indices,
                'sentence': sentences,
                'block': blocks
            }
        }
        level = predictLevel(doc_indices['indices']['document'])
        result.append({
            'feedback': compare_feedback(expert, doc_indices),
            'level': level
        })

    return result
コード例 #8
0
def textualComplexityPost():
    """HTTP handler: compute textual-complexity indices for one text.

    Expects a JSON body with 'text' and 'language'. Builds the default
    word2vec model for the language, computes indices at document,
    paragraph, and sentence level, groups them by category, and returns
    the JSON-serialized TextualComplexityResponse.

    Raises ValueError for an unsupported language (previously this fell
    through the if/elif chain and crashed with UnboundLocalError).
    """
    params = json.loads(request.get_data())
    text = params.get('text')
    languageString = params.get('language')
    lang = str_to_lang(languageString)

    # One default word2vec corpus per supported language; a dispatch table
    # replaces the original six-branch if/elif chain.
    corpus_by_lang = {
        Lang.RO: CorporaEnum.README,
        Lang.EN: CorporaEnum.COCA,
        Lang.ES: CorporaEnum.JOSE_ANTONIO,
        Lang.FR: CorporaEnum.LE_MONDE,
        Lang.RU: CorporaEnum.RNC_WIKIPEDIA,
        Lang.DE: CorporaEnum.WIKI,
    }
    if lang not in corpus_by_lang:
        raise ValueError("Unsupported language: {}".format(languageString))
    corpus = corpus_by_lang[lang]
    vector_model = VECTOR_MODELS[lang][corpus][VectorModelType.WORD2VEC](
        name=corpus.value, lang=lang)

    document = Document(lang=lang, text=text)
    cna_graph = CnaGraph(docs=document, models=[vector_model])
    compute_indices(doc=document, cna_graph=cna_graph)

    categoriesList = []
    complexityIndices = {}

    def add_indices(indices, **dto_kwargs):
        # Fold one element's indices into the category-keyed accumulators;
        # replaces three near-identical copies of this loop.
        for key, value in indices.items():
            categoryName = key.category.name
            if categoryName not in categoriesList:
                categoriesList.append(categoryName)
            dto = ComplexityIndexDTO(repr(key), float(value), **dto_kwargs)
            complexityIndices.setdefault(categoryName, []).append(dto)

    add_indices(document.indices, type="document")

    # TODO: train a real classifier; the proficiency level is hard-coded.
    level = "C"

    for paragraph_id, paragraph in enumerate(document.components):
        add_indices(paragraph.indices, type="paragraph",
                    paragraph_index=paragraph_id)

    for paragraph_id, paragraph in enumerate(document.components):
        for sentence_id, sentence in enumerate(paragraph.components):
            add_indices(sentence.indices, type="sentence",
                        paragraph_index=paragraph_id,
                        sentence_index=sentence_id)

    complexityIndicesResponse = [
        ComplexityIndicesDTO(category, indices)
        for category, indices in complexityIndices.items()
    ]
    texts = [[sentence.text for sentence in paragraph.components]
             for paragraph in document.components]

    textualComplexityDataDTO = TextualComplexityDataDTO(
        languageString, level, texts, categoriesList,
        complexityIndicesResponse)
    textualComplexityResponse = TextualComplexityResponse(
        textualComplexityDataDTO, "", True)
    return textualComplexityResponse.toJSON()