Example no. 1
    def analyze_text(self, text):
        words = get_words(text)
        char_count = get_char_count(words)
        word_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count / sentence_count

        self.analyzedVars = {
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }

        outData = {
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }
        return outData
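The helpers called here (get_words, get_sentences, get_char_count, count_syllables, count_complex_words) are not included in the listing; a minimal regex-based sketch of what they might look like, inferred only from how they are used above:

# Minimal sketch (not from the listing) of the helpers the analyzer assumes.
import re

def get_words(text):
    # Split on non-word characters and drop empty tokens.
    return [w for w in re.split(r"\W+", text) if w]

def get_sentences(text):
    # Naive sentence split on ., ! and ?.
    return [s for s in re.split(r"[.!?]+", text) if s.strip()]

def get_char_count(words):
    return sum(len(w) for w in words)

def count_syllables(words):
    # Rough heuristic: count vowel groups per word, at least one per word.
    return sum(max(1, len(re.findall(r"[aeiouy]+", w.lower()))) for w in words)

def count_complex_words(text):
    # Words with three or more vowel groups count as "complex" here.
    return sum(1 for w in get_words(text)
               if len(re.findall(r"[aeiouy]+", w.lower())) >= 3)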
Example no. 2
    def analyze_text(self, text):
        words = get_words(text)
        char_count = int(get_char_count(words))
        word_count = int(len(words))
        sentences = get_sentences(text)
        len_sentences = len(sentences)
        sentence_count = int(len_sentences)
        # sentence_count = int(len(get_sentences(text)))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text.decode('utf-8'))
        avg_words_p_sentence = word_count / sentence_count
        encoding_dict = detect_encoding(self.filename)

        self.analyzedVars = {
            'filename': self.filename,
            # 'text_truncated': text[:200].replace("\n", " "),
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence),
            'encoding': encoding_dict['encoding'],
            'encoding_confidence': encoding_dict['confidence']
        }
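detect_encoding is not shown either; since the snippet reads encoding_dict['encoding'] and encoding_dict['confidence'], which is exactly the dictionary chardet.detect returns, a plausible sketch (assuming chardet is the underlying library) is:

# Possible implementation of detect_encoding, assuming chardet is used.
import chardet

def detect_encoding(filename):
    with open(filename, 'rb') as f:
        raw = f.read()
    # chardet.detect returns e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}
    return chardet.detect(raw)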
Example no. 3
    def __read_files(self, opinions_path):
        ''' Read the files (opinions) '''
        files = sorted(os.listdir(opinions_path))
        all_words = {}

        for file_name in files:
            sentences = utils.get_sentences(
                os.path.join(opinions_path, file_name))
            stars = self.__aspect_manager.get_stars_review(
                self.__name,
                re.match('(.+)\.txt', file_name).group(1))

            review_words = []
            for (id_sentence, text_sentence) in sentences:
                id_general = id_sentence + "_" + file_name
                clean_text = self.__clean_text(text_sentence)
                review_words += clean_text
                self.__sentence_list[id_general] = {
                    'clean_text': clean_text,
                    'raw_text': text_sentence,
                    'stars': stars,
                    'tfidf_sentence': 0,
                    'tfidf_words': {}
                }

            for word in review_words:
                if word not in all_words: all_words[word] = []
                if file_name not in all_words[word]:
                    all_words[word].append(file_name)

        self.__calculate_tfidf(float(len(files)), all_words)
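__calculate_tfidf is called with the number of reviews and the word-to-files map but is not shown; the IDF half of that computation presumably looks something like this hypothetical sketch:

import math

def calculate_idf(num_files, all_words):
    # all_words: word -> list of files containing the word, i.e. its document frequency.
    # Standard IDF: log(N / df). This is an assumption about what __calculate_tfidf does.
    return {word: math.log(num_files / len(files))
            for word, files in all_words.items()}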
Example no. 4
def build_vocabulary():
    pathlib.Path(params.data_path).parent.mkdir(parents=True, exist_ok=True)
    pathlib.Path(params.save_path).parent.mkdir(parents=True, exist_ok=True)

    model = None
    if os.path.isfile(params.save_path):
        if not yes_or_no("Vocabulary existed! Do you want to overwrite?"):
            model = Word2Vec.load(params.save_path)

    if model is None:
        items = pickle.load(open(params.data_path, 'rb'))
        normalized_sentences = get_sentences(items)
        del items

        stopwords = load_stopwords()
        sentence_iterator = DictionaryGenerator(
            normalized_sentences,
            stopwords=stopwords)
        del normalized_sentences
        print("\nTotal of sentences: %d" % len(sentence_iterator.sentences))

        model = Word2Vec(sentence_iterator, seed=3695, min_count=1, sg=params.alg, size=params.word_embedding_size,
                         window=params.window_size, sample=params.sample_rate, negative=params.n_negative,
                         workers=max(1, multiprocessing.cpu_count()), iter=params.n_epochs)

        sentence_iterator.save_special_words()
        # del sentences
        del sentence_iterator

        print("Saving dictionary at " + params.save_path)
        model.save(params.save_path)

    word_vectors = model.wv
    del model
    print("Done. Vocabulary size is: %d" % len(word_vectors.vocab))
Example no. 5
def get_last_summaries(text, final_lis, stopwords, model):
    """
    获取最终的摘要列表
    :param stopwords: 停用词
    :param model: 词向量模型
    :return: 摘要列表[摘要1,摘要2...]
    """
    #判断是否用MMR
    if GlobalParameters.use_MMR:
        results = MMR(final_lis, stopwords, model)
    else:
        results = final_lis[:GlobalParameters.last_num]
        #注意此处的results 是以元祖为元素的列表 需要将句子取出来
        results = [x[0] for x in results]
    #为了使得句子读起来连贯 我们按照摘要句子在原始文章里的位置信息 进行排序
    sentences = utils.get_sentences(text)  #[(1,句子1),(2,句子2)。。]
    # print("句子是",sentences)
    #定义摘要列表 [(句子,位置),(句子,位置)..]
    summaries = []

    for summary in results:
        for sentence in sentences:
            if summary == sentence[1]:
                summaries.append((summary, sentence[0]))
    # print("summaries:",summaries)
    # sort by position in the original article
    summaries = sorted(summaries, key=lambda x: x[1])

    # keep only the sentences, dropping the position info
    summaries = [x[0] for x in summaries]

    return summaries
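MMR is not part of the listing; a generic Maximal Marginal Relevance sketch over the (sentence, weight) tuples, assuming the same sentence_vector helper used elsewhere in this project and illustrative values for the trade-off and cut-off parameters:

import numpy as np
from generate_vector import sentence_vector  # assumed helper, as in the doc_vector snippets

def MMR(final_lis, stopwords, model, lambda_=0.7, top_n=3):
    # Maximal Marginal Relevance: balance each sentence's precomputed weight
    # against its redundancy with the sentences already selected.
    def cos(a, b):
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return float(np.dot(a, b) / denom) if denom else 0.0

    vectors = {sent: sentence_vector(sent, stopwords, model) for sent, _ in final_lis}
    weights = dict(final_lis)
    selected, remaining = [], [sent for sent, _ in final_lis]
    while remaining and len(selected) < top_n:
        def mmr_score(sent):
            redundancy = max((cos(vectors[sent], vectors[s]) for s in selected), default=0.0)
            return lambda_ * weights[sent] - (1 - lambda_) * redundancy
        best = max(remaining, key=mmr_score)
        selected.append(best)
        remaining.remove(best)
    return selected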
Example no. 6
def doc_vector(text, stop_words, model):
    """
    Compute the document vector as the average of its sentence vectors.
    :param text: the document to process
    :param stop_words: stopword list
    :param model: word-vector model
    :return: document vector
    """

    # get the (position, sentence) list
    sen_lis = utils.get_sentences(text)
    # keep only the sentences
    sen_lis = [x[1] for x in sen_lis]
    # initial document vector; 100 matches the dimension of the trained word vectors
    vector = np.zeros(100,)
    # number of sentences in the document
    length = len(sen_lis)
    # iterate over all sentences
    for sentence in sen_lis:
        # sentence vector
        sen_vec = sentence_vector(sentence, stop_words, model)
        # accumulate it into the document vector
        vector += sen_vec
        # print(vector)

    # return the average as the document vector
    return vector / length
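sentence_vector is not shown in the listing; a plausible companion to doc_vector, assuming jieba tokenization and a simple average of in-vocabulary word vectors (the 100-dimension default mirrors the snippet above and is an assumption):

import jieba
import numpy as np

def sentence_vector(sentence, stop_words, model, dim=100):
    # Average the vectors of the sentence's words, skipping stopwords and
    # tokens that are not in the word2vec vocabulary.
    words = [w for w in jieba.cut(sentence) if w not in stop_words and w in model.wv]
    if not words:
        return np.zeros(dim)
    return np.mean([model.wv[w] for w in words], axis=0)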
Example no. 7
    def __read_files(self, opinions_path):
        files = sorted(os.listdir(opinions_path))

        for file_name in files:
            sentences = utils.get_sentences(
                os.path.join(opinions_path, file_name))
            for (id_sentence, text_sentence) in sentences:
                self.__process_sentence(file_name, id_sentence, text_sentence)
Example no. 8
    def __read_files(self, opinions_path):
        ''' Read the files (opinions) '''
        files = sorted(os.listdir(opinions_path))
        for file_name in files:
            sentences = utils.get_sentences(os.path.join(opinions_path, file_name))
            for (id_sentence, text_sentence) in sentences:
                annotations = self.__aspect_manager.get_data_sentence(
                    self.__name,
                    re.match('(.+)\.txt', file_name).group(1),
                    id_sentence)['annotations']
                self.__process_annotations(
                    file_name + '_' + id_sentence, annotations, text_sentence)
Example no. 9
def generate_summary(topic_file, to_summarize):
    all_sents = utils.get_sentences(to_summarize)
    tw_dict = load_topic_words(topic_file)
    sent_dict = generate_sentence_dict(all_sents, tw_dict)
    top_sents = sorted(sent_dict.items(), key=lambda t: t[1], reverse=True)
    pretty = []
    for sent in top_sents:
        if not utils.is_repeat(sent[0], pretty):
            pretty.append(sent[0])
    # return 100 words
    return " ".join(word_tokenize(" ".join(pretty))[:100])
Example no. 10
def generate_summary(topic_file, to_summarize):
    all_sents = utils.get_sentences(to_summarize)
    tw_dict = load_topic_words(topic_file)
    sent_dict = generate_sentence_dict(all_sents, tw_dict)
    top_sents = sorted(sent_dict.items(), key=lambda t: t[1], reverse=True)
    pretty = []
    for sent in top_sents:
        if not utils.is_repeat(sent[0], pretty):
            pretty.append(sent[0])
    # return 100 words
    return " ".join(word_tokenize(" ".join(pretty))[:100])
Example no. 11
def search(google, query, keywords):
    print "\nSearching \"%s\"" % (query)
    all_sentences = []
    for start in xrange(1, 31, 10):
        print(start / 10) + 1,
        try:
            result = google.search(query, start=start)
            sentences = utils.get_sentences(result)
            all_sentences.extend(sentences)
        except HttpError as e:
            print "\nERROR: {}\n".format(e)
            break
            # sys.exit()
    filtered_sentences = utils.filter_sentences(list(set(all_sentences)),
                                                keywords)
    return all_sentences, filtered_sentences
Example no. 12
def search(google, query, keywords):
    print "\nSearching \"%s\"" % (query)
    all_sentences = []
    for start in xrange(1, 31, 10):
        print (start / 10) + 1,
        try:
            result = google.search(query, start=start)
            sentences = utils.get_sentences(result)
            all_sentences.extend(sentences)
        except HttpError as e:
            print "\nERROR: {}\n".format(e)
            break
            # sys.exit()
    filtered_sentences = utils.filter_sentences(list(set(all_sentences)),
                                                keywords)
    return all_sentences, filtered_sentences
Example no. 13
def build_batch(items, wv, min_doc_length, common_words, uncommon_words):
    batch = {}
    count = 0

    docs_sentences = get_sentences(items)
    for newsId, sentences in docs_sentences.items():
        print_inline('Pre-process items {}/{}'.format(count, len(docs_sentences)))
        count += 1

        words = [w for s in sentences for w in s.strip().split()]
        if len(words) < min_doc_length:
            continue

        words_indices = [get_word_index(wv, word, common_words, uncommon_words) for word in words]
        batch[newsId] = words_indices

    return batch
Example no. 14
    def analyze_text(self, text):
        words = get_words(text)
        char_count = get_char_count(words)
        word_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count/sentence_count

        self.analyzedVars = {
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }
Example no. 15
def execute(treebank, dev):
    print "reading treebank..."
    parses = utils.read_parses_no_indent(treebank)
    parse_lists = []
    for parse in parses:
        parse_lists.append(utils.make_parse_list(parse))
      
    print "learning pcfg..."  
    nonterms, terms, start, prob = grammar.learn(parse_lists)
    
    print "learning hmm..."
    emission, transition = sequnece_labeler.learn(parse_lists)

    print "reading dev data..."
    dev_sentences = utils.get_sentences(dev)
    print dev_sentences[100] 
    for sentence in dev_sentences:
        parse = cky.run(sentence, nonterms, start, prob)
        sequnece = viterbi.run(sentence, emission, transition)
Example no. 16
    def analyze_text(self, text):
        words = get_words(text)
        char_count = get_char_count(words)
        words_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        print("syllable_count:", syllable_count)
        complex_words_count = count_complex_words(text)
        avg_words_per_sentence = int(words_count / sentence_count)
        print("avg_words_per_sentence", avg_words_per_sentence)
        self.ana_vars = {
            'words': words,
            'char_count': float(char_count),
            'words_count': float(words_count),
            'sentence_count': float(sentence_count),
            'syllable_count': float(syllable_count),
            'complex_words_count': float(complex_words_count),
            'avg_words_per_sentence': float(avg_words_per_sentence)
        }
Example no. 17
def main():

    snli_data = utils.get_sentences(file_path='data/snli_sentences_all.txt')

    print('[INFO] Number of sentences = {}'.format(len(snli_data)))

    sentences = [s.strip() for s in snli_data]

    np.random.shuffle(sentences)
    sentences = [word_tokenize(s) for s in sentences]
    w2v_model = gensim.models.Word2Vec(sentences,
                                       size=300,
                                       min_count=1,
                                       iter=50)
    if not os.path.exists('w2v_models'):
        os.mkdir('w2v_models')

    w2v_model.save('w2v_models/w2v_300d_snli_all_sentences.pkl')
    print('[INFO] Word embeddings pre-trained successfully')
Example no. 18
    def analyze_text(self, text):
        words = get_words(text)
        char_count = int(get_char_count(words))
        word_count = int(len(words))
        sentence_count = int(len(get_sentences(text)))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count / sentence_count

        self.analyzedVars = {
            'filename': self.filename,
            # 'text_truncated': text[:200].replace("\n", " "),
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }
Example no. 19
    def handle_submit(self):
        filename_sentence_pairs = utils.get_sentences(self.text_input.value)
        sentences = [pair[1] for pair in filename_sentence_pairs]
        encodings = utils.get_encodings(sentences)
        embedding = self.projector.fit_transform(encodings)

        tooltip_sentences = []
        for sentence in sentences:
            index = sentence.lower().index(self.text_input.value)
            subsentence = sentence[max(index-settings.TEXT_TOOLTIP_WINDOW_SIZE, 0):\
                                   min(settings.TEXT_TOOLTIP_WINDOW_SIZE+index, len(sentence))]
            if len(subsentence) < len(sentence):
                subsentence = '...' + subsentence + '...'
            tooltip_sentences.append(subsentence)

        filename_to_color = dict()
        for filename, _ in filename_sentence_pairs:
            if filename not in filename_to_color:
                filename_to_color[filename] = self.COLORS[len(
                    filename_to_color)]
        color = [
            filename_to_color[filename]
            for filename, _ in filename_sentence_pairs
        ]

        source = ColumnDataSource(data=dict(x=embedding[:, 0],
                                            y=embedding[:, 1],
                                            text=tooltip_sentences,
                                            fill_color=color))

        figure = Figure(tooltips=[('text', '@text')])
        figure.circle('x',
                      'y',
                      fill_color='fill_color',
                      radius=0.05,
                      line_color=None,
                      source=source)
        self.layout.children[-1] = column(figure, sizing_mode='stretch_both')
Example no. 20
def flesch_kincaid_score(article):
	xml_url = '&titles='.join([xml_api_url, article])
	try:
		xml = requests.get(xml_url).content
		bs = BeautifulSoup(xml)

		try:
			text = str(bs.find('extract').contents[0].encode('utf-8'))	# convert NavigableString to string after encoding
			non_text = ['== See also ==\n', '== References ==\n', ' === Further references ===\n', '== External links ==\n', '== Notes ==\n']
			for ele in non_text:
				text = text.split(ele, 1)[0]
			text = re.sub('==.*==', '', text)
			words = get_words(text)
			syllableCount = count_syllables(text)
			sentences = get_sentences(text)
			fk = 206.835 - 1.015*len(words)/len(sentences) - 84.6*(syllableCount)/len(words)
			return float(format(fk,'.2f'))
		except:
			print 'Error while computing fk score of ' + article
			print format_exc()

	except:
		print 'Error while fetching xml content of ' + article
		print format_exc()
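For reference, the expression above is the Flesch Reading Ease formula; the Flesch-Kincaid grade level suggested by the function name would instead be computed as:

def flesch_kincaid_grade(word_count, sentence_count, syllable_count):
    # Flesch-Kincaid grade level (the snippet above computes Flesch Reading Ease).
    return 0.39 * (word_count / sentence_count) + 11.8 * (syllable_count / word_count) - 15.59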
Example no. 21
    for sentence, tag in zip(sentences, tags):
        formated_sentence = []
        formated_tag = []
        for (token, ), tag_ in zip(sentence, tag):
            max_len = max(len(token), len(tag_)) + 4
            formated_sentence.append(fill(token, max_len))
            formated_tag.append(fill(tag_, max_len))

        no_lines = len(formated_sentence) // max_words_per_line + (
            0 if len(formated_sentence) % max_words_per_line == 0 else 1)

        for i in range(no_lines):
            print(' '.join(formated_sentence[max_words_per_line *
                                             i:max_words_per_line * (i + 1)]))
            print(' '.join(formated_tag[max_words_per_line *
                                        i:max_words_per_line * (i + 1)]))
            print('\n')


if __name__ == '__main__':
    model = load_model('data/model/crf_model_no_pos_chunk.pkl')
    is_stop = False
    while not is_stop:
        paragraph = input('Enter a paragraph: ')
        if paragraph == 'n' or paragraph == 'N':
            is_stop = True
        else:
            sentences = get_sentences(paragraph)
            tags = model.predict(sentences)
            print_result(sentences, tags)
Example no. 22
        crf_loss_reduction=args['crf_loss_reduction'],
        using_pos_chunk=args['using_pos_chunk']
    )
    model = model.to(device)
    model.eval()

    print('program is running.....')
    is_stop = False
    while not is_stop:
        paragraph = input('Enter a paragraph: ')
        if paragraph == 'n' or paragraph == 'N':
            is_stop = True
        elif paragraph == '':
            continue
        else:
            text_sentences = utils.get_sentences(paragraph)
            sentences = [Sentence(s, word_vocab=voc) for s in text_sentences]

            ds = dataset.Dataset(sentences, word_padding_idx=voc.padding_index,
                                 pos_padding_idx=const.POS_PADDING_IDX,
                                 chunk_padding_idx=const.CHUNK_PADDING_IDX,
                                 character_padding_idx=const.CHARACTER2INDEX['<PAD>'],
                                 tag_padding_idx=const.CHUNK_PADDING_IDX)
            dl = dataset.DataLoader(ds, batch_size=len(sentences))

            for ((batch_sentence_word_indexes,
                  batch_sentence_pos_indexes,
                  batch_sentence_chunk_indexes,
                  batch_sentence_word_character_indexes),
                 batch_sentence_tag_indexes,
                 batch_sentence_lengths,
Example no. 23
import tensorflow as tf
tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
sess = tf.Session(config=tf_config)

import numpy as np
import utils

from stochastic_wae import StochasticWAEModel
from sklearn.model_selection import train_test_split

np.random.seed(1337)

snli_data = utils.get_sentences(
    file_path=
    '/Users/emielzyde/Downloads/probabilistic_nlg/snli/data/noisy_text.txt')

print('[INFO] Number of sentences = {}'.format(len(snli_data)))

sentences = [s.strip() for s in snli_data]

np.random.shuffle(sentences)

print('[INFO] Tokenizing input and output sequences')
filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
x, word_index = utils.tokenize_sequence(sentences, filters,
                                        config['num_tokens'],
                                        config['vocab_size'])

print('[INFO] Split data into train-validation-test sets')
Example no. 24
from tensorflow.contrib.rnn import DropoutWrapper
import utils

DATA_PATH = '../retokenized_corpus.txt'
# FEATURE_NUM = 64
BATCH_SIZE = 128
EMBEDDING_SIZE = unit_num = 300  # By default the word-vector size equals the number of units in the RNN (per time step) and the CNN (per column); unit_num is used throughout the model to avoid confusion.
# MAX_SEQUENCE_SIZE = time_step = 100      # The maximum sentence length equals time_step; time_step is used throughout the model to avoid confusion.
DROPOUT_RATE = None
EPOCH = 60000

embeddings = utils.load_word2vec_embedding()
word_to_id_table, id_to_word_table, tag_to_id_table, id_to_tag_table = utils.build_word_tag_tables(
)
all_sentences, all_tags = \
    utils.get_sentences(word_to_id_table, tag_to_id_table)
group = utils.group_by_sentences_padding(all_sentences, all_tags)

TAGS_NUM = len(tag_to_id_table)


class NER_net:
    def __init__(self, scope_name, batch_size):
        self.batch_size = batch_size
        with tf.variable_scope(scope_name) as scope:
            self._build_net()

    def _build_net(self):
        self.time_step = tf.placeholder(tf.int32, 1)
        self.x = tf.placeholder(tf.float32,
                                [self.batch_size, self.time_step, unit_num])
Example no. 25
    def simall(self, doc):
        scores = []
        for index in range(self.D):
            score = self.sim(doc, index)
            scores.append(score)
        return scores


if __name__ == "__main__":
    text = """
    自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。
    它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。
    自然语言处理是一门融语言学、计算机科学、数学于一体的科学。
    因此,这一领域的研究将涉及自然语言,即人们日常使用的语言,
    所以它与语言学的研究有着密切的联系,但又有重要的区别。
    自然语言处理并不是一般地研究自然语言,
    而在于研制能有效地实现自然语言通信的计算机系统,
    特别是其中的软件系统。因而它是计算机科学的一部分。
    """
    sents = utils.get_sentences(text)
    doc = []
    for sent in sents:
        words = list(jieba.cut(sent))
        words = utils.filter_stop(words)
        doc.append(words)
    print(doc)
    s = BM25(doc)
    print(s.f)
    print(s.idf)
    print(s.simall(['自然语言', '计算机科学', '领域', '人工智能', '领域']))
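For context, the sim/simall methods of a BM25 class like this one typically implement the standard Okapi BM25 score; a self-contained sketch of that scoring function (k1, b and avgdl are the usual free parameters, not values taken from the listing):

import math

def bm25_score(query_terms, doc_terms, idf, k1=1.5, b=0.75, avgdl=1.0):
    # Okapi BM25: sum, over query terms, of IDF weighted by a saturated
    # term frequency that is normalized by document length.
    score = 0.0
    for term in query_terms:
        f = doc_terms.count(term)
        score += idf.get(term, 0.0) * f * (k1 + 1) / (f + k1 * (1 - b + b * len(doc_terms) / avgdl))
    return score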
Example no. 26
os.environ["CUDA_VISIBLE_DEVICES"] = config['device']

import tensorflow as tf
tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
sess = tf.Session(config=tf_config)

import numpy as np
import utils

from det_wae import DetWAEModel
from sklearn.model_selection import train_test_split

np.random.seed(1337)

snli_data = utils.get_sentences(file_path = config['data'])

print('[INFO] Number of sentences = {}'.format(len(snli_data)))

sentences = [s.strip() for s in snli_data]

np.random.shuffle(sentences)

print('[INFO] Tokenizing input and output sequences')
filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
x, word_index = utils.tokenize_sequence(sentences,
                                             filters,
                                             config['num_tokens'],
                                             config['vocab_size'])

print('[INFO] Split data into train-validation-test sets')
Example no. 27
from tensorflow.contrib.rnn import DropoutWrapper
import utils


DATA_PATH = '../retokenized_corpus.txt'
# FEATURE_NUM = 64
BATCH_SIZE = 128
EMBEDDING_SIZE = unit_num = 300         # By default the word-vector size equals the number of units in the RNN (per time step) and the CNN (per column); unit_num is used throughout the model to avoid confusion.
MAX_SEQUENCE_SIZE = time_step = 100      # The maximum sentence length equals time_step; time_step is used throughout the model to avoid confusion.
DROPOUT_RATE = None
EPOCH = 20000

embeddings = utils.load_word2vec_embedding()
word_to_id_table, id_to_word_table, tag_to_id_table, id_to_tag_table = utils.build_word_tag_tables()
all_sentences, all_tags = \
    utils.get_sentences(word_to_id_table, tag_to_id_table, max_sequence=MAX_SEQUENCE_SIZE)

TAGS_NUM = len(tag_to_id_table)


class NER_net:
    def __init__(self, scope_name, batch_size):
        self.batch_size = batch_size
        with tf.variable_scope(scope_name) as scope:
            self._build_net()

    def _build_net(self):
        self.x = tf.placeholder(tf.float32, [None, time_step, unit_num])
        self.y = tf.placeholder(tf.int32, [None, time_step])
        seq_x = tf.reshape(self.x, [-1, time_step * unit_num])
        seq_x = tf.split(seq_x, time_step, axis=1)
Example no. 28
import jieba
# from bm25 import BM25
from textrank import TextRank
import utils
from snownlp import seg
from sys import argv

fact = argv[1]
# fact = '公诉机关指控:2016年3月28日20时许,被告人颜某在本市洪山区马湖新村足球场马路边捡拾到被害人谢某的VIVOX5手机一部,' \
#       '并在同年3月28日2、1时起,分多次通过支付宝小额免密支付功能,秘密盗走被害人谢某支付宝内人民币3723元。案发后,被告人颜某家属已赔偿被害人全部损失,' \
#       '并取得谅解。公诉机关认为被告人颜某具有退赃、取得谅解、自愿认罪等处罚情节,建议判处被告人颜某一年以下××、××或者××,并处罚金。'
if __name__ == '__main__':

    sents = utils.get_sentences(fact)
    doc = []
    for sent in sents:
        words = seg.seg(sent)
        # words = list(jieba.cut(sent))
        words = utils.filter_stop(words)
        doc.append(words)
    # print(doc)
    # s = BM25(doc)
    # print(s.f)
    # print(s.df)
    # print(s.idf)

    rank = TextRank(doc)
    rank.text_rank()
    for index in rank.top_index(3):
        print(sents[index])
Example no. 29
def get_best_wordnet_sent(question, story, use_sch=True):
    # qbow = get_bow(question['text'])
    #initialize stemmer
    stemmer = PorterStemmer()
    best_sent = None
    best_score = 0.0

    #get right version of text to pull best sentence out of
    if (isinstance(story["sch"], str) and use_sch == True):
        sentences = story["sch"]
        print("using sch")
        # print(sentences)
    else:
        sentences = story["text"]
        print("using text")
    sentences = nltk.sent_tokenize(sentences)

    #first check qwords against wordnet words
    qwords = nltk.word_tokenize(question['text'])
    qwords = get_bow(get_sentences(question['text'])[0], stopwords)
    # print("better qbow:")
    # print(better_bow(question))

    i = 0
    for sent in wn_story_dict[question['sid']]:
        sent_score = 0.0
        # did_match = False
        words_not_found = set(qwords)

        for qword in qwords:
            for word in sent:
                if str(qword) == word:
                    # print('matched ' + qword + ' with ' + word)
                    sent_score += 1
                    # print('sent ' + str(i) + ' score: ' + str(sent_score))
                    # print('sent ' + str(i) + ': ' + sentences[i])
                    words_not_found.remove(qword)
                    # break

        print("words not found: " + str(words_not_found))
        # if words not in wordnet data, try factoring in word similarity a bit

        for qword in words_not_found:
            highest_sim = 0
            for word in sent:
                if word in model.vocab and qword in model.vocab:
                    sim = model.similarity(word, str(qword))
                    # print("sim of '" + word + "' and '" + qword + "' = " + str(sim))
                    if sim > highest_sim:
                        highest_sim = sim
                        # print("sim of '" + word + "' and '" + qword + "' = " + str(sim))
                        # print('sent ' + str(i) + ' score: ' + str(sent_score))
                        # print('sent ' + str(i) + ': ' + sentences[i])

            if highest_sim > 0.3:
                sent_score += highest_sim

        if sent_score > best_score:
            best_score = sent_score
            best_sent = sentences[i]
        print(sent)
        i += 1

    # if nothing matched while using the Scheherazade version, retry on the full text
    if best_sent == None and use_sch == True:
        return get_best_wordnet_sent(question, story, False)
    return best_sent
Example no. 30
def get_first_summaries(text, stopwords, model):
    """

    :param text: 文档
    :param stopwords: 停用词
    :param model: 词向量模型
    :return: 摘要列表  按照权重从大到小排列[(句子,权重),(句子,权重)]
    """
    #获取(位置,句子)列表
    sentences = utils.get_sentences(text)

    #获取句子列表
    sen_lis = [x[1] for x in sentences]
    # print(sen_lis)
    #获取文档向量
    docvec = generate_vector.doc_vector(text, stopwords, model)

    #获取句子向量列表
    sen_vecs = []
    for i in range(len(sen_lis)):
        #假设是首句
        if i == 0:
            sen_vecs.append(
                generate_vector.sentence_vector(sen_lis[i], stopwords, model) *
                GlobalParameters.locFirst_weight)
        #如果是最后一句
        elif i == len(sen_lis) - 1:
            sen_vecs.append(
                generate_vector.sentence_vector(sen_lis[i], stopwords, model) *
                GlobalParameters.locLast_weight)
        #如果是中间的句子
        else:
            sen_vecs.append(
                generate_vector.sentence_vector(sen_lis[i], stopwords, model))

    #计算余弦值列表
    cos_lis = [utils.cos_dist(docvec, x) for x in sen_vecs]

    # keyword weights
    # extract the keywords of the document
    keywords = utils.get_keywords(text)

    # keyword weight of each sentence
    keyweights = [utils.keyword_weight(x, keywords) for x in sen_lis]

    # length weight of each sentence
    len_weigths = [utils.len_weight(x) for x in sen_lis]

    # combine cosine similarity, keyword weight and length weight into one final weight per sentence
    final_weights = [
        cos * keyword * length
        for cos, keyword, length in zip(cos_lis, keyweights, len_weigths)
    ]

    # build the final (sentence, weight) list
    final_lis = []
    for sen, weight in zip(sen_lis, final_weights):
        final_lis.append((sen, weight))

    # sort sentences by weight, from high to low
    final_lis = sorted(final_lis, key=lambda x: x[1], reverse=True)

    # keep only the number of sentences required for the first-pass summary
    final_lis = final_lis[:GlobalParameters.first_num]

    return final_lis
Example no. 31
import tensorflow as tf
tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
sess = tf.Session(config=tf_config)

import numpy as np
import utils

from det_wae import DetWAEModel
from sklearn.model_selection import train_test_split

np.random.seed(1337)

if __name__ == '__main__':
    combined_data = utils.get_sentences(file_path = '/Users/emielzyde/Downloads/noisy_data.txt')
    input_data = utils.get_sentences(file_path = '/Users/emielzyde/Downloads/clean_data.txt')
    output_data = utils.get_sentences(file_path = '/Users/emielzyde/Downloads/noisy_data.txt')

    labels = []
    label_path = '/Users/emielzyde/Downloads/labels.txt'
    with open(label_path, 'r') as f:
        label_data = f.readlines()
    for item in label_data:
        item = item.rstrip()
        labels.append(int(item))

    print('[INFO] Number of sentences = {}'.format(len(combined_data)))

    combined_sentences = [s.strip() for s in combined_data]
    input_sentences = [s.strip() for s in input_data]