def chequear_plagio(plagio_algorithm, doc1, doc2, file, trained_model,
                    UMBRAL_SIMILARIDAD=0.9):

    doc1 = ut.preprocess_text(doc1)
    doc2 = ut.preprocess_text(doc2)

    # Algorithm 'A' uses the Word2Vec-based checker; anything else falls back to WordNet.
    if plagio_algorithm == 'A':
        return chequear_plagio_word2vec(doc1, doc2, file, trained_model,
                                        UMBRAL_SIMILARIDAD)
    else:
        return chequear_plagio_wordnet(doc1, doc2, file, trained_model,
                                       UMBRAL_SIMILARIDAD)
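The checkers dispatched to above (`chequear_plagio_word2vec`, `chequear_plagio_wordnet`) are not shown in this listing. As a rough standalone sketch of the same idea (flagging plagiarism when two preprocessed documents exceed a similarity threshold), a TF-IDF/cosine variant could look like the following; the vectorizer choice, the helper name `is_plagiarism`, and the 0.9 threshold are assumptions for illustration, not the original implementation.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def is_plagiarism(doc1, doc2, threshold=0.9):
    # Embed both documents in a shared TF-IDF space and compare them.
    tfidf = TfidfVectorizer().fit_transform([doc1, doc2])
    return cosine_similarity(tfidf[0], tfidf[1])[0, 0] >= threshold


print(is_plagiarism("the cat sat on the mat", "a cat was sitting on the mat"))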
Example No. 2
def generate_ctx_emission_table(lines,
                                mode="en",
                                ctx_mode="prev_word",
                                lower=False,
                                norm_tense=False,
                                replace_number=False,
                                replace_year=False,
                                replace_symbol=False):
    hashmap = {}
    Y = {}
    skipped = []
    word_freq = {"##UNK##": 0}
    for ln, line in enumerate(lines):
        try:
            x, _ = line.split(" ")
            if ln == 0 or lines[ln - 1] == "":
                y = "##START##"
            else:
                if ctx_mode == "prev_word":
                    y, _ = lines[ln - 1].split(" ")
                else:
                    _, y = lines[ln - 1].split(" ")
            # x is the current word; y is the previous word (ctx_mode="prev_word")
            # or the previous word's tag otherwise
            y = utils.preprocess_text(y, mode, lower, norm_tense,
                                      replace_number, replace_year,
                                      replace_symbol)
            x = utils.preprocess_text(x, mode, lower, norm_tense,
                                      replace_number, replace_year,
                                      replace_symbol)
            if x in word_freq:
                word_freq[x] += 1
            else:
                word_freq[x] = 1
            if y in hashmap:
                if x in hashmap[y]:
                    hashmap[y][x] += 1
                else:
                    hashmap[y][x] = 1
            else:
                hashmap[y] = {}
                hashmap[y][x] = 1
            if y in Y:
                Y[y] += 1
            else:
                Y[y] = 1
        except Exception as e:
            if line not in skipped:
                # print(e)
                skipped.append(line)
    #print("Skipped", len(skipped), "lines: ", skipped)
    return {"x_hashmap": hashmap, "x_word_freq": word_freq, "y_tags": Y}
Example No. 3
def classify(opts):
    '''
    model and vectorizer must be stored in opts.model_dir directory
    '''
    prop_names = read_prop_names(opts.prop_names_file)
    models = []
    vects = []  #vectorizers
    for name in prop_names:
        model_file = opts.model_dir + "/" + name + ".model"
        vect_file = opts.model_dir + "/" + name + ".vect"
        if os.path.exists(model_file):
            print("Loading " + model_file)
            model = pickle.load(open(model_file, "rb"))
            print("Loading " + vect_file)
            vect = pickle.load(open(vect_file, "rb"))
            models.append(model)
            vects.append(vect)
    print("Classifying")
    out = open(opts.classify_outfile, "w")
    with open(opts.index_file) as lines:
        for line in lines:
            obj = json.loads(line)
            processed_sentences = [
                utils.preprocess_text(i) for i in obj['text']
            ]
            obj['scores'] = []
            for i in range(len(models)):
                X = vects[i].transform(processed_sentences)
                scores = models[i].decision_function(X)
                obj['scores'].append(scores.tolist())
            out.write(json.dumps(obj) + "\n")
    out.close()
Example No. 4
def predict_rating(review_text,
                   classifier,
                   vectorizer,
                   decision_threshold=0.5):
    """
    Given the classifier, vectorizer and text, classify whether that text is a
    positive or negative review.

    Args:
        review_text (str): The review that needs to be classified.
        classifier (ReviewClassifier): The model that has been trained for classification.
        vectorizer (utils.Vectorizer): The vectorizer used to convert the text to a vector.
        decision_threshold (float): Sigmoid outputs below this value map to class 0.
    Returns:
        The class label (positive or negative) looked up in the rating vocabulary.
    """

    review_text = preprocess_text(review_text)
    review_vector_np = vectorizer.vectorize(review_text)
    review_vector = torch.from_numpy(review_vector_np)
    result = torch.sigmoid(classifier(review_vector.view(1, -1)))
    class_label = 0 if result.item() < decision_threshold else 1

    return vectorizer.rating_vocab.lookup_index(class_label)
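The classifier and vectorizer above are trained objects that are not part of this listing. The decision rule itself (sigmoid output compared against `decision_threshold`) can be illustrated with a stand-in linear model; the layer size and input vector below are invented purely for illustration.

import torch
import torch.nn as nn

# Stand-in classifier: one linear layer over a 4-dimensional bag-of-words vector.
classifier = nn.Linear(4, 1)
review_vector = torch.tensor([[1.0, 0.0, 2.0, 1.0]])  # shape (1, input_dim)

# Same decision rule as predict_rating: sigmoid, then compare to the threshold.
probability = torch.sigmoid(classifier(review_vector)).item()
class_label = 0 if probability < 0.5 else 1
print(probability, class_label)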
Example No. 5
def searchresults(search_string, num_results):
    search_string = preprocess_text(search_string)
    search_vect = np.array([question_to_vec(search_string, w2v_model)])
    
    search_results = []
    cosine_similarities = pd.Series(cosine_similarity(search_vect, all_title_embeddings)[0]) 
    #cosine_similarities = cosine_similarities*(0.4*data.overall_scores + 0.1*(data.sentiment_polarity))

    for i, j in cosine_similarities.nlargest(int(num_results)).items():
        # Build a short snippet from the first ~200 characters of the question body.
        output = ''
        for t in data.question_content[i][:200].split():
            output += " " + str(t)
        temp = {
            'Title': str(data.original_title[i]),
            'url': str(data.question_url[i]),
            'Id': str(i),
            'answer': str(data.answers_content[i]),            
            'Tags': str(data.tags[i]),
            'similarity_score': str(j)[:5],
            'votes': str(data.overall_scores[i]),
            'Body':str(output)
        }
        search_results.append(temp)
    return search_results
Example No. 6
    def fit_transform(self, texts):
        clean_texts = [' '.join(preprocess_text(t)) for t in texts]
        transformer = CountVectorizer(min_df=3,
                                      max_df=0.7,
                                      preprocessor=lambda x: x,
                                      tokenizer=lambda t: t.split())
        return np.array(transformer.fit_transform(clean_texts).todense())
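A quick standalone illustration of what this CountVectorizer configuration produces; `min_df` is lowered to 1 here so the toy corpus is not filtered out, and `get_feature_names_out` assumes a reasonably recent scikit-learn.

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["the cat sat", "the cat ran", "a dog ran"]
vectorizer = CountVectorizer(min_df=1,
                             preprocessor=lambda x: x,
                             tokenizer=lambda t: t.split())
counts = np.array(vectorizer.fit_transform(corpus).todense())
print(vectorizer.get_feature_names_out())  # column order of the vocabulary
print(counts)                              # one row of token counts per text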
Example No. 7
        def _lcs_match(max_dist):
            f.fill(0)
            g.clear()

            # longest common subsequence
            # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j))
            for i in range(N):

                # note(zhiliny):
                # unlike standard LCS, this is specifically optimized for the setting
                # because the mismatch between sentence pieces and original text will
                # be small
                for j in range(i - max_dist, i + max_dist):
                    if j >= M or j < 0: continue

                    if i > 0:
                        g[(i, j)] = 0
                        f[i, j] = f[i - 1, j]

                    if j > 0 and f[i, j - 1] > f[i, j]:
                        g[(i, j)] = 1
                        f[i, j] = f[i, j - 1]

                    f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0
                    if (preprocess_text(paragraph_text[i],
                                        lower=config.uncased,
                                        remove_space=False) == tok_cat_text[j]
                            and f_prev + 1 > f[i, j]):
                        g[(i, j)] = 2
                        f[i, j] = f_prev + 1
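For reference, the recurrence cited in the comment is the standard longest-common-subsequence DP; the band over j in the snippet above simply restricts the table to |i - j| < max_dist because the sentence pieces and the original text are assumed to be nearly aligned. A minimal unbanded version, written as a standalone sketch:

import numpy as np


def lcs_length(a, b):
    # Classic O(len(a) * len(b)) longest-common-subsequence length.
    f = np.zeros((len(a) + 1, len(b) + 1), dtype=int)
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            match = 1 if a[i - 1] == b[j - 1] else 0
            f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match)
    return int(f[len(a), len(b)])


print(lcs_length("abcde", "ace"))  # 3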
Example No. 8
    def fit_transform(self, texts):
        clean_texts = [preprocess_text(t) for t in texts]
        if self.train:
            size = self.d
            # Note: gensim >= 4.0 renamed the `size` argument to `vector_size`.
            self.model = Word2Vec(size=size, workers=8, min_count=3)
            self.model.build_vocab(clean_texts)
            self.model.train(clean_texts, total_examples=len(clean_texts), epochs=10)
            vectors = self.model.wv
        else:
            size = 300
            # The pre-trained model is returned as a KeyedVectors object.
            self.model = api.load('word2vec-google-news-300')
            vectors = self.model

        embs = []
        for text in clean_texts:
            # Average the vectors of the in-vocabulary words of each text.
            emb = np.zeros(size)
            n = 0
            for w in text:
                if w in vectors:
                    emb += vectors[w]
                    n += 1

            if n != 0:
                emb = emb / n

            embs.append(emb)

        sentence_embeddings = np.array(embs)

        return sentence_embeddings
Example No. 9
    def predict_sentences_2_idxs(self):
        """Replaces each Quora question with the indexes corresponding to the
        positions of its tokens in the embedding matrix. If include_unknown
        is true, unknown tokens are replaced with the unknown-token index;
        otherwise they are dropped.

        Writes one line of space-separated token indexes per question to
        self.config.parsed_predict_file.
        """
        self.load_dicts()

        questions = pd.read_csv(self.config.predict_file,
                                usecols=["question_text"], index_col=False)
        unk_idx = self.word2idx[self.config.unknown_token]

        with open(self.config.parsed_predict_file, 'w') as fo:
            for quest in questions.question_text:
                tokens = utils.preprocess_text(quest)
                if self.config.include_unknown:
                    idxs = [self.word2idx.get(token, unk_idx) for token in
                            tokens]
                else:
                    idxs = [self.word2idx.get(token) for token in tokens]
                    idxs = [idx for idx in idxs if idx]
                fo.write(" ".join(str(num) for num in idxs) + "\n")
Example No. 10
def clasificar_documento(doc,
                         threshold=0.3,
                         categories=[
                             'economy', 'technology', 'health',
                             'science-environment', 'business', 'politics',
                             'entertainment', 'sport'
                         ]):
    try:
        doc_english = translator.translate(ut.preprocess_text(doc)).text
        sentences = tokenize.sent_tokenize(doc_english)

        sum_scores = inicializar_scores(categories)
        # Only a slice of the sentences (5 to 10) is sent to the classification API.
        for sent in sentences[5:10]:
            url = 'https://api.dandelion.eu/datatxt/cl/v1'

            payload = {
                'text': sent,
                'model': "54cf2e1c-e48a-4c14-bb96-31dc11f84eac",
                'token': 'cbbf951e9b704ea4a3ddfd09d27bed1d',
                'min_score': threshold
            }

            jsonData = requests.get(url, params=payload).json()
            sum_scores = update_scores(sum_scores, jsonData)

        return get_argmax(sum_scores)
    except Exception:
        print("Error categorizing the text.")
Example No. 11
def main():
    args = parser.parse_args()

    src = Path(args.source_dir)
    dest = Path(args.destination_dir)

    dataset_names = (args.test_dir_name, args.train_dir_name)

    print("Converting raw dataset to rows...\n")
    row_dict = utils.raw_dataset_to_row_dict(
        dataset_root_path=src,
        dataset_dir_names=dataset_names,
    )

    for dataset_name, rows in row_dict.items():
        for i, (text, label) in tqdm(enumerate(rows),
                                     total=len(rows),
                                     desc="Preprocessing text"):
            row_dict[dataset_name][i] = (utils.preprocess_text(text), label)
    print()

    if not os.path.exists(dest):
        print(f"Creating directory {dest} since it doesn't exist...")
        os.makedirs(dest)

    print("Saving datasets as TSV...")
    for dataset_name in dataset_names:
        dataset_dest = dest / f"{dataset_name}.tsv"

        with open(dataset_dest, mode="w+", encoding="utf8",
                  errors="replace") as f:
            csv.writer(f, delimiter="\t").writerows(row_dict[dataset_name])

    print("Done!\n")
Example No. 12
def make_dataset(params):
    strings = re.compile('[^a-zA-Z]')
    data_dir = os.path.join('./processed', params["directory"])
    tags = os.listdir(data_dir)
    if params["tags"] is not None:
        tags = params["tags"].split(";")
    if "unclassified" in tags:
        tags.remove("unclassified")
    if len(tags) < 2:
        return None
    db = Database()
    texts = []
    labels = []
    for i, tag in enumerate(tags):
        class_dir = os.path.join(data_dir, tag)
        ids = os.listdir(class_dir)
        for id in ids:
            data, status = db.fetch_document(id)
            if data:
                try:
                    json_path = os.path.join(data['processed_path'],
                                             data['id'] + '.json')
                    with open(json_path, 'r') as fi:
                        data['content'] = json.loads(fi.read())
                    doc = Document(data)
                    text = doc.get_text()
                    text = utils.preprocess_text(text)
                    texts.append(text)
                    labels.append(i)
                except Exception as e:
                    pass
    train_x, test_x, train_y, test_y = train_test_split(
        texts, labels, test_size=params["split"])
    return train_x, test_x, train_y, test_y, tags
    def evaluate_line(self, line):
        if isinstance(line, str):
            raw_test_comments = [line]
        elif isinstance(line, (list, tuple)):
            raw_test_comments = [question for question, entity_dict in line]
        else:
            raise ValueError('[Format error] the question field value must be a string or a list!')
        processed_test_comments = []
        for comment in raw_test_comments:
            processed_test_comments.append(preprocess_text(comment))

        test_sequences = self.tokenizer.texts_to_sequences(
            processed_test_comments)

        final_test_data = pad_sequences(test_sequences, maxlen=150)

        rets = self.model.predict(x=final_test_data, batch_size=1)

        ret = []
        for pred, question in zip(rets, raw_test_comments):
            # argsort returns the indices that would sort the array in ascending order
            sort_index = pred.argsort()
            pred_ret = [{
                'question': question,
                'intent': self.id2label[_index],
                'score': float(pred[_index])
            } for _index in sort_index[-5:][::-1]]
            ret.append(pred_ret)
            # label = self.id2label[pred.argmax()]
            # score = float(pred.max())
            # ret.append([{'question': question, 'intent': label, 'score': score}])

        logger.info("Entity recognition result for question `{}`: {}".format(line, ret))
        return ret
    def explain(self, text, nwords, return_weights=False):
        '''
        Use `LimeTextExplainer` to obtain the top `nwords` most important/polar words in the `text` as 
        an explanation.


        Parameters
        --------------
        text: str
            The text to explain.

        nwords: int
            The number of most important words to return (i.e. explanation size).

        return_weights: bool
            Set to True to return the weights assigned by LIME also.

        Returns
        ---------------
        word_ranking : list
            Indexes of the `nwords` top-ranked words in the text.
        
        ranked_words: list
            List of `nwords` top-ranked words in the text.

        weights: dict, optional
            The dictionary of weights (word position -> weight) assigned by LIME to the words
            in the text.

        explanation: optional
            The explanation object returned by `LimeTextExplainer`.
        '''
        text = preprocess_text(text)
        text_words = get_tokens(text)

        class_names = ['negative', 'positive']
        # bow is set to False because word order is important
        explainer = LimeTextExplainer(class_names=class_names,
                                      feature_selection='auto',
                                      bow=False,
                                      split_expression=' ',
                                      verbose=False)

        explanation = explainer.explain_instance(
            text_instance=text,
            labels=[0, 1],
            classifier_fn=self.predict_texts,
            num_features=nwords,
            num_samples=self.nsamples)
        # sort weights by decreasing absolute value
        weights = OrderedDict(
            sorted(explanation.as_map()[1],
                   key=lambda weight: -abs(weight[1])))
        word_ranking = np.array(list(weights.keys()))
        ranked_words = [text_words[i] for i in word_ranking]
        if return_weights:
            return word_ranking, ranked_words, weights, explanation
        return word_ranking, ranked_words
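A minimal standalone sketch of the same LimeTextExplainer call pattern, with a toy `classifier_fn` standing in for the wrapped model (`self.predict_texts`) used above; the scoring rule and sample text are invented purely for illustration.

import numpy as np
from lime.lime_text import LimeTextExplainer


def toy_predict(texts):
    # Toy binary "model": P(positive) grows with the number of occurrences of "good".
    pos = np.array([min(1.0, 0.2 + 0.3 * t.split().count("good")) for t in texts])
    return np.column_stack([1.0 - pos, pos])


explainer = LimeTextExplainer(class_names=['negative', 'positive'],
                              bow=False, split_expression=' ')
explanation = explainer.explain_instance("a good and very good movie",
                                         classifier_fn=toy_predict,
                                         labels=[1],
                                         num_features=3,
                                         num_samples=500)
print(explanation.as_map()[1])  # [(word_position, weight), ...] for the positive class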
Example No. 15
def pico_preprocess(line):
    line = dict(text=line.abstract,
                P=line.population,
                I=line.intervention,
                O=line.outcome)
    if pico_constraint(line):
        return {k: preprocess_text(v) for k, v in line.items()}
    else:
        return line
    def explain_text_words(self, text, rank_by_importance=True):
        '''
        Word level explanation.
        '''
        text = preprocess_text(text)
        text_words = get_tokens(text)
        y = self.model.predict_class(text)
        word_ranking, values = self.sbe(text_words, y, rank_by_importance)
        ranked_words = [text_words[i] for i in word_ranking]
        return word_ranking, ranked_words, values
Example No. 17
def getsearchresults():
    params = request.json
    if params is None:
        params = request.args

    query = params["query"]
    query = preprocess_text(query)
    tags = list(predict_tags(query))
    results = searchresults(query, params["num_results"])
    return jsonify({'tags': tags, 'results': results})
Example No. 18
    def attack(self, text, target_class, search_algorithm, random_attack=False):
        '''
        Attack text to change the prediction to `target_class`.

        Parameters
        -----------------
        text: str
            The text to attack.
        
        target_class: int
            The class to change the classification to.

        search_algorithm: str
            The search algorithm to use to attack the text: 'greedy' or 'beam'.

        random_attack: bool, optional
            If True, words are selected randomly as targets for the attack.

        '''
        text = preprocess_text(text)
        x = get_tokens(text)
        explanation_size = int(self.percentage * len(x))
        if self.explainer is None:  # target all words
            print("No explainer provided. Targeting all words in the input...")
            candidate_words_indexes = np.arange(len(x))
            candidate_words = np.array(x)[candidate_words_indexes].tolist()
        elif not random_attack:
            print('Generating explanation...')
            candidate_words_indexes, candidate_words = self.explainer.explain(text, explanation_size)
        else:
            print("Randomly selecting candidate words to perturb...")
            candidate_words_indexes = np.random.choice(len(x), explanation_size, replace=False)
            candidate_words = np.array(x)[candidate_words_indexes].tolist()
        assert len(candidate_words_indexes) == len(candidate_words)
        print("Extracted candidate words: ", candidate_words)
        synonyms_map = self.build_synonyms_map(candidate_words)
        print("Built synonyms map.")
        candidate_replacements = self.get_valid_replacements(x, candidate_words_indexes, synonyms_map)
        print("Filtered replacements.")
        Attacker.print_candidate_stats(candidate_replacements)
        # print("candidate_replacements: ")
        # pprint(candidate_replacements)
        if search_algorithm == 'greedy':
            print('Running greedy search...')
            used_replacements, adversary_found, prediction = self.greedy_search(x, candidate_replacements, target_class)
        elif search_algorithm == 'beam':
            print('Running beam search...')
            used_replacements, adversary_found, prediction = self.beam_search(x, candidate_replacements, target_class)
        else:
            raise ValueError('Invalid search algorithm provided')
        print("Chose replacements.")

        # Generate adversarial text
        adv_text = Attacker.get_adv_text(text, used_replacements)
        return used_replacements, adversary_found, adv_text, prediction
Example No. 19
def process(fipc):
    global index
    for line in fipc:
        # Note: the .decode('utf-8') calls assume byte-string input (file opened in binary mode).
        line = line.strip().split()
        ipc = line[0].decode('utf-8')
        desc = " ".join(line[1:])
        normalized = utils.preprocess_text(desc.decode('utf-8')).keys()
        for w in normalized:
            ipclist = index.get(w, [])
            ipclist.append(ipc)
            index[w] = ipclist
Example No. 20
    def text2idx(self, input_text):
        tokens = utils.preprocess_text(input_text)

        if self.config.include_unknown:
            idxs = [self.word2idx.get(token, self.unk_idx) for token in
                    tokens]
        else:
            idxs = [self.word2idx.get(token) for token in tokens]
            # Drop out-of-vocabulary tokens (note: the truthiness test also drops index 0).
            idxs = [idx for idx in idxs if idx]

        return np.array(idxs)
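A hedged standalone sketch of the same token-to-index lookup, with a toy vocabulary in place of self.word2idx (the vocabulary and unknown token are invented); the sketch uses an explicit `is not None` test in the second branch so that a word legitimately mapped to index 0 is not dropped.

import numpy as np

word2idx = {"<unk>": 0, "how": 1, "are": 2, "you": 3}
unk_idx = word2idx["<unk>"]


def text2idx(tokens, include_unknown=True):
    if include_unknown:
        idxs = [word2idx.get(token, unk_idx) for token in tokens]
    else:
        idxs = [idx for idx in (word2idx.get(token) for token in tokens)
                if idx is not None]
    return np.array(idxs)


print(text2idx(["how", "are", "dragons"]))                         # [1 2 0]
print(text2idx(["how", "are", "dragons"], include_unknown=False))  # [1 2]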
Example No. 21
    def predict(self, text):
        """
        Predicts class of text and returns the label prediction and the model
        probability of the predicted label.

        If multiple text items are passed, the method returns a tuple
        containing two arrays -- array one contains predicted labels and array
        two contains predicted label probabilities.

        Arguments:
            text (str or list): text to be classified.
        """
        # If self.clean_text_ apply preprocess_text function according to type.
        if self.clean_text_:
            if type(text) == str:
                text = preprocess_text(text)
            else:
                text = [preprocess_text(item_text) for item_text in text]
        # Predict and return text label and probability.
        return self.model_.predict(text)
def collect_quotes(quotes):
    """Structure final quotes as a list of records for display in a table."""
    collection = []
    for q in quotes:
        # Checking for 'PERSON' before assigning a speaker - if the quote is of type 'Heuristic',
        # the conditions are relaxed and we accept the quote with a blank speaker name
        if q.get('named_entity_type') == 'PERSON' or q.get(
                'quote_type') == 'Heuristic':
            speaker = q.get('named_entity', "")
            quote = preprocess_text(q.get('quote', ""))
            collection.append({'speaker': speaker, 'quote': quote})
    return collection
Example No. 23
def deanonymize_dataset(
    rg_path: str,
    standardized_dataset: Dataset,
    processed_dataset_path: str = None,
    n_samples: int = None,
):
    """Take an anonymized dataset and add back the original dataset columns."""
    assert processed_dataset_path is not None, \
        "Please specify a path to save the dataset."

    # Load the dataset
    dataset = Dataset.load_from_disk(rg_path)

    if n_samples:
        dataset.set_visible_rows(list(range(n_samples)))
        standardized_dataset.set_visible_rows(list(range(n_samples)))

    text_columns = []

    # Add columns from the standardized dataset
    dataset.add_column('document', standardized_dataset['document'])
    text_columns.append('document')

    if 'summary:reference' in standardized_dataset.column_names:
        dataset.add_column('summary:reference',
                           standardized_dataset['summary:reference'])
        text_columns.append('summary:reference')

    # Preprocessing all the text columns
    dataset = dataset.update(
        lambda x:
        {f'preprocessed_{k}': preprocess_text(x[k])
         for k in text_columns})

    # Run the Spacy pipeline on all preprocessed text columns
    try:
        nlp = load('en_core_web_lg')
    except OSError:
        nlp = load('en_core_web_sm')

    nlp.add_pipe('sentencizer', before="parser")
    spacy = Spacy(nlp=nlp)
    dataset = spacy(
        dataset,
        [f'preprocessed_{col}' for col in text_columns],
        batch_size=100,
    )

    # Directly save to disk
    dataset.save_to_disk(processed_dataset_path)

    return dataset
Example No. 24
    def train(self, train_set: pd.DataFrame, force: bool = False, save: bool = True) -> None:
        if not force and self.LogReg_pipeline is not None:
            return

        self.LogReg_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=self.stop_words)),
            ('clf', LogisticRegression(solver='sag')),
        ])

        self.LogReg_pipeline.fit(train_set['comment_text'].map(lambda com: utils.preprocess_text(com)),
                                 train_set['bannable'])

        if save:
            utils.dump(self.LogReg_pipeline, "log_pipeline")
def count_missing_words():
    data_model = QuoraQuestionsModel(DataConfig(), 10, 10, 300)
    data_model.load_dicts()
    questions = pd.read_csv(data_model.config.train_file,
                            usecols=["question_text"],
                            index_col=False)
    word_count = defaultdict(int)
    for quest in questions.question_text:
        tokens = preprocess_text(quest)
        for token in tokens:
            if token not in data_model.word2idx:
                word_count[token] += 1

    fo = open("../data/missing_word_counts_new.txt", "wb")
    pickle.Pickler(fo, 4).dump(word_count)
    fo.close()
Example No. 26
def parse_doc(collection, doc):
    """Perform quote extraction conditionally on one document"""
    try:
        if doc is None:
            app_logger.error('Document not found.')
            return
        doc_id = str(doc['_id'])

        text = doc['body']
        text_length = len(text)
        if text_length > MAX_BODY_LENGTH:
            app_logger.warn(
                'Skipping document {0} due to long length {1} characters'.format(doc['_id'], text_length))
            if update_db:
                collection.update(
                    {'_id': ObjectId(doc_id)},
                    {
                        '$set': {
                            'lastModifier': 'max_body_len',
                            'lastModified': datetime.now()
                        },
                        '$unset': {
                            'quotes': 1
                        }
                    },
                    upsert=True
                )
            return
        # Process document
        doc_text = utils.preprocess_text(doc['body'])
        spacy_doc = nlp(doc_text)

        quotes = extract_quotes(doc_id=doc_id, doc=spacy_doc, write_tree=write_quote_trees_in_file)
        if update_db:
            collection.update(
                {'_id': ObjectId(doc_id)},
                {'$set': {
                    'quotes': quotes,
                    'lastModifier': 'quote_extractor',
                    'lastModified': datetime.now()}})
        else:
            # If dry run, then display extracted quotes (for testing)
            print('=' * 20, ' Quotes ', '=' * 20)
            for q in quotes:
                print(q, '\n')
    except Exception:
        app_logger.exception("message")
        traceback.print_exc()
Example No. 27
    def fix(self, text, target_class, beam_size=4, random_fix=False):
        '''
        Change the classification of a text to the correct class.

        Parameters
        ------------
        text: str
            The text that is misclassified.
        
        target_class: int
            The label of the class to change the prediction to

        beam_size: int
            The beam width to use in the beam search.

        random_fix: bool, optional
            If set to True, words will be targeted randomly for replacement.


        Returns
        ----------------
        suggestions: list
            The list of suggested replacement sets.


        '''
        text = preprocess_text(text)
        x = get_tokens(text)
        explanation_size = int(self.percentage * len(x))
        if self.explainer is None:  # target all words
            print("No explainer provided. Targeting all words in the input...")
            candidate_words_indexes = np.arange(len(x))
            candidate_words = np.array(x)[candidate_words_indexes].tolist()
        elif not random_fix:
            print('Generating explanation...')
            candidate_words_indexes, candidate_words = self.explainer.explain(text, explanation_size)
        else:
            print("Randomly selecting candidate words to perturb...")
            candidate_words_indexes = np.random.choice(len(x), explanation_size, replace=False)
            candidate_words = np.array(x)[candidate_words_indexes].tolist()
        print("Extracted candidate words: ", candidate_words)
        synonyms_map = self.build_synonyms_map(candidate_words)
        print("Built synonyms map.")
        candidate_replacements = self.get_valid_replacements(x, candidate_words_indexes, synonyms_map)
        print('Filtered replacements.')
        print('Running beam search...')
        suggestions = self.beam_search(x, candidate_replacements, target_class, beam_size=beam_size, return_multiple=True)
        return suggestions
Example No. 28
def similarity_check(ques_text, ref_ques_dict):
    '''
    For a given question text, checks whether the sum of its similarity
    scores against all reference questions is below the minimum score.
    If so, takes each word from the question and finds similar words,
    also trying a spell check, and appends the possible similar words
    to the original question text.

    rtype: str
    '''
    if not ques_text:
        return ''
    # remove trailing spaces, spl characters
    ques_text = preprocess_text(ques_text)

    sim_words = []
    # get the similarity scores from
    # the string matching algorithms
    sim_scores = get_sim_ref_ques(ques_text)

    if sum(sim_scores.values()) <= min_sim_score:
        # if the similarity score is very low
        # lower than minimum accepted, try
        # finding similar words as a fallback
        # option #1
        sim_words = get_similar_words_sent(ques_text, max_sim_words, min_count)

        if not sim_words:
            # if there are no similar words
            # there is a possibility that
            # this word has been misspelled
            # try to do a spell check
            spelled = spell_check(ques_text)
            sim_words = [spelled]

    if sim_words:
        # if either the bigrams from gensim
        # word vec or spell check generated
        # similar words, add to the original
        # question text
        ques_text = str(ques_text).replace('"', '')
        new_text = str(" ".join(sim_words))
        ques_text = ques_text + " " + new_text

    return ques_text
    def predict(self, tweet, seq_length):
        tweet = preprocess_tweet(tweet, punctuation=True)

        tweet = preprocess_text(tweet)

        tokens = [tokenize_custom(tweet, self.vocab_to_int)]

        features = pad_features(tokens, seq_length=seq_length)

        self.cuda()
        with torch.no_grad():
            h = self.init_hidden(1)
            output, h = self(
                torch.from_numpy(features).type(torch.cuda.LongTensor), h)

            softmax = nn.Softmax(dim=1)
        return softmax(output).cpu().numpy()
Example No. 30
def predict():

    checkpoint_file = os.path.join(MODEL_PATH, CHECKPOINT_FILE)
    classes_to_labels_file = os.path.join(MODEL_PATH, LABELS_FILE)
    embedding_matrix_file = os.path.join(MODEL_PATH, EMBEDDING_MATRIX_FILE)
    # model_file = os.path.join(MODEL_PATH, 'model.pkl')
    tokenizer_file = os.path.join(MODEL_PATH, TOKENIZER_FILE)

    predicate_label = pickle.load(open(classes_to_labels_file, 'rb'), encoding="iso-8859-1")
    embedding_matrix = pickle.load(open(embedding_matrix_file, 'rb'), encoding="iso-8859-1")
    # model = pickle.load(open(model_file, 'rb'), encoding="iso-8859-1")
    nb_words, EMBEDDING_DIM = embedding_matrix.shape
    label2id = {k: t.argmax() for k, t in predicate_label.items()}
    id2label = {_id: label for label, _id in label2id.items()}

    model = make_model(nb_words, EMBEDDING_DIM, embedding_matrix, len(predicate_label))
    model.load_weights(checkpoint_file)
    # model = load_model(checkpoint_file)

    tokenizer = pickle.load(open(tokenizer_file, 'rb'), encoding="iso-8859-1")

    test_data = read_data(DEV_FILE)

    raw_test_comments = [t[0] for t in test_data]

    test_y = np.array([predicate_label[t[1]] for t in test_data])

    processed_test_comments = []
    for comment in raw_test_comments:
        processed_test_comments.append(preprocess_text(comment))

    test_sequences = tokenizer.texts_to_sequences(processed_test_comments)

    final_test_data = pad_sequences(test_sequences, maxlen=150)
    # print('test_data', test_data[:3])
    print('Model evaluation')
    ret = model.predict(x=final_test_data, batch_size=1)
    # print('Prediction results:', ret)
    # print('label', 'prediction', 'question')
    rets = []
    for label, pred, question in zip(test_y, ret, test_data):
        print(id2label[label.argmax()], id2label[pred.argmax()], question)
        rets.append([id2label[label.argmax()], id2label[pred.argmax()], question])

    print('Accuracy: {}'.format(len([t for t in rets if t[0] == t[1]]) / len(rets)))