Example #1
def match_questions_tfidf(questions, clusters):
    # clusters = {
    #     "questions": [ 'and the moon too', 'lets show some' ],
    #     "clusterIds": [ 4, 4 ]
    # }

    # questions = [ {
    #         "id": 11, "question": 'Another one about the sun?'
    #     },
    #     {
    #         "id": 33,
    #         "question": 'What is the distance from the sun though?' },
    #     {
    #         "id": 37,
    #         "question": 'what\'s the changing factors of the sun and moon together?'
    # } ]

    # Clean and Lemmatize all the questions in the clusters
    for idx, question in enumerate(clusters["questions"]):
        question = clean_text(question.replace("\n", ""))
        clusters["questions"][idx] = " ".join([d.lemma_ for d in nlp(question)])

    # Clean and Lemmatize all the questions in the orphan group
    for idx, question in enumerate(questions):
        question = clean_text(questions[idx]["question"].replace("\n", ""))
        questions[idx]["question"] = " ".join([d.lemma_ for d in nlp(question)])

    # Create corpus
    completeCorpus = clusters["questions"] + [q["question"] for q in questions]
    # completeCorpus = " ".join(clusters["questions"])
    # completeCorpus += " ".join([q["question"] for q in questions])

    vectorizer = TfidfVectorizer(
        analyzer='char',
        ngram_range=(1, 3),
        use_idf=True,
        sublinear_tf=True,
        smooth_idf=True,
        stop_words='english'  # note: scikit-learn ignores stop_words when analyzer != 'word'
    )
    vectorizer.fit(completeCorpus)

    # add tfidf vectors to the clusters object for each question
    clusters["question_vectors"] = vectorizer.transform(clusters["questions"]).toarray()
    # for idx, question in enumerate(clusters["questions"]):
    #     clusters["question_vectors"].append(vectorizer.transform(question).toarray())

    # add a tfidf vector to each orphan question
    for idx, question in enumerate(questions):
        question = questions[idx]["question"]
        questions[idx]["question_vector"] = vectorizer.transform([question]).toarray()[0]

    # print("\n Clusters: ", clusters)
    # print("\n Questions: ", questions)
    # print("\n Complete Corpus: ", completeCorpus)

    cluster.findBestFitCluster(questions, clusters)

    return True
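For reference, a minimal self-contained sketch of the assignment step that cluster.findBestFitCluster presumably performs: fit one char n-gram TF-IDF space over both groups, then give each orphan question the cluster id of its most similar clustered question. The cosine-similarity nearest-neighbour rule is an assumption for illustration, not the project's actual implementation.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cluster_questions = ["and the moon too", "lets show some"]
cluster_ids = [4, 4]
orphans = ["Another one about the sun?",
           "What is the distance from the sun though?"]

# one shared char n-gram TF-IDF space over both groups
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), sublinear_tf=True)
vectorizer.fit(cluster_questions + orphans)

cluster_vecs = vectorizer.transform(cluster_questions)
orphan_vecs = vectorizer.transform(orphans)

# similarity of every orphan question to every clustered question
sims = cosine_similarity(orphan_vecs, cluster_vecs)
best = sims.argmax(axis=1)
assignments = [cluster_ids[j] for j in best]
print(assignments)  # one cluster id per orphan question, e.g. [4, 4]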
Example #2
    def run_pipeline(self):
        os.chdir(self.loc)
        
        print("normalize and tokenize file")
        normalized_file_name = self.file_name.split('.')[0] + '.norm.tok.txt'
        normalize_file(self.lang, self.file_name, normalized_file_name)

        print("removing punctuation with single space and removing lines with foreign characters")
        clean_file_name = normalized_file_name.replace('.txt', '.clean.txt')
        clean_text(normalized_file_name, clean_file_name, self.dict_path)

        print("removing duplicate lines")
        unique_file_name = clean_file_name.replace('.txt', '.unique.txt')
        remove_duplicate(clean_file_name, unique_file_name)

        print('completed')
Example #3
def iterateWordClouds():
    cloudfolder = "main_img/WordClouds/"
    sheets = ['Companies', 'Founders', 'VC', 'Events', 'Institutions']
    wordcloud_columns = [
        'Description', 'About', "Detailed_Description", "ShortDescription"
    ]

    # Build a word cloud for each text column in each sheet
    for sheet in sheets:  # e.g. sheet = "Founders"
        d = pd.read_excel(io=path + tables, sheet_name=sheet)
        for column in wordcloud_columns:  # e.g. column = "Description"
            print("%s: %s" % (sheet, column))
            if column in d.columns:
                # Filter those without description
                filename = "%s%s %s.png" % (cloudfolder, sheet, column)
                d[column] = d[column].apply(lambda x: " ".join(clean_text(x))
                                            if type(x) is str else None)
                text = d[~d[column].isnull()][column]
                text = " ".join(text.values)
                if (sheet == "Companies"):
                    text = text.replace("product",
                                        "").replace("servic",
                                                    "").replace("compani", "")
                elif (sheet == "Founders"):
                    text = text.replace("founder", "").replace("cofound", "")
                GetWordCloud(text, filename, width=890, height=420)
Example #4
def _clean_texts(input_text, file_format, twitter):
    delimiter = DELIMITER[file_format]
    joiner = JOINER[file_format]
    return joiner.join([
        clean_text(text=text, twitter=twitter)
        for text in input_text.split(delimiter)
    ])
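DELIMITER and JOINER are module-level lookup tables that are not shown in this snippet. A hypothetical sketch of their shape and a call follows; the keys, values and the stand-in clean_text are all assumptions for illustration only.

# Assumed shape of the lookup tables (not the project's real values)
DELIMITER = {"txt": "\n", "csv": ","}
JOINER = {"txt": "\n", "csv": ","}

def clean_text(text, twitter=False):
    """Stand-in cleaner for illustration only."""
    return text.strip().lower()

# Splits on the format's delimiter, cleans each piece, and re-joins them
print(_clean_texts("Hello World\n GOODBYE ", "txt", twitter=False))
# -> 'hello world\ngoodbye'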
Example #5
    def predict(self, x):
        predicts = []

        for test_str in x:
            test_str = "<s> " + test_str + " </s>"

            a = clean_text.clean_text(test_str)
            #print(a)
            a = a.split(' ')


            x_seq = numpy.zeros((len(a), self.n_in), dtype='float64')
            y_seq = numpy.zeros((len(a),), dtype='int32')

            for i in range(len(a)):
                word = a[i]

                if word in self.word2label:
                    y_seq[i] = self.word2label[word]
                else:
                    y_seq[i] = self.word2label["%%%"]

                if word in self.model:
                    x_seq[i, :] = self.model[word]
                else:
                    x_seq[i, :] = self.model["xxxxx"]

            #print(y_seq)

            predicts.append(self.f_pred(*[x_seq, y_seq]))

        return predicts
Example #6
    def get(self, model, text):
        if model.lower() == "use_large":
            text = clean_text(text)
            vecs = np.array(use_large([text]))
            return {"use_large": vecs[0].tolist()}
        else:
            return {"error": "unknown model"}
Example #7
def calc_flexibility_and_elaboration_multi_target(responses, target_words,
                                                  nlp):
    """"""
    data = pd.DataFrame({'response': responses, 'target_word': target_words})
    data['clean_response'] = [
        clean_text(response, nlp) for response in data.response
    ]
    data['elaboration'] = data.apply(
        lambda row: len(row.clean_response.split())
        if row.clean_response is not None else None,
        axis=1)
    # to control for effects of response length (elaboration) on semantic similarity, calculate similarity expected by
    # chance for all given response lengths to subtract from response similarity
    # (Forthmann et al, 2018 https://doi.org/10.1002/jocb.240)
    bootstrapped_sims = {}
    for target in data.target_word.unique():
        word_counts = data.elaboration.loc[data.target_word == target].unique()
        bootstrapped_sims[target] = bootstrap_similarity(word_counts, target)

    data['raw_similarity'] = data.apply(
        lambda row: calc_similarity(row.clean_response, row.target_word, nlp),
        axis=1)
    data['corrected_similarity'] = data.apply(
        lambda row: row.raw_similarity - bootstrapped_sims[row.target_word][
            row.elaboration]
        if not np.isnan(row.elaboration) else row.raw_similarity,
        axis=1)
    # flexibility is dissimilarity score, so invert the similarity score to get flexibility
    data['flexibility'] = (1 - abs(data['corrected_similarity']))
    return data[['clean_response', 'elaboration', 'flexibility']]
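A worked toy illustration of the chance correction described in the comments above; all numbers are made up for illustration and are not real model output.

raw_similarity = 0.62        # spaCy similarity between a 7-word response and its target word
baseline_similarity = 0.35   # bootstrapped similarity expected by chance for 7-word responses
corrected = raw_similarity - baseline_similarity   # 0.27
flexibility = 1 - abs(corrected)                   # 0.73: higher means more dissimilar from the target
print(corrected, flexibility)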
Example #8
def match_questions_with_categories(questions, clusters):
    """A simple matching algorithm that places questions into a pre-created cluster if:
        1. The question's lemmatized form contains the cluster's keyword
        2. The question contains no rarer English words that are also cluster keywords

        Parameters:
            questions (list[dict]): A list of dictionaries with an id and question (text) field
            clusters (list[string]): A list of pre-created keywords
    """
    cluster_additions = { "uncategorized": [] }
    for question in questions:
        clean_question = clean_text(question["question"].replace("\n", ""))
        cluster_options = set()
        for token in nlp(clean_question):
            if token.lemma_ in clusters:
                cluster_options.add(token.lemma_)
        if len(cluster_options) == 0:
            cluster_additions["uncategorized"].append(question["id"])
            continue
        best_keyword = None
        rarest_freq = 1
        for keyword in cluster_options:
            if word_frequency(keyword, "en") < rarest_freq:
                rarest_freq = word_frequency(keyword, "en")
                best_keyword = keyword
        if best_keyword in cluster_additions:
            cluster_additions[best_keyword].append(question["id"])
        else:
            cluster_additions[best_keyword] = [question["id"]]
    return cluster_additions
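A usage sketch, assuming spaCy's nlp, a word_frequency function (presumably from the wordfreq package) and clean_text are already available as in the function above; the sample questions and the printed result are illustrative only.

questions = [
    {"id": 11, "question": "How far is the moon from the earth?"},
    {"id": 33, "question": "Why is the sky blue at noon?"},
]
clusters = ["moon", "earth", "sun"]

print(match_questions_with_categories(questions, clusters))
# question 11 lands in the rarest matching keyword's cluster (e.g. "moon"),
# question 33 matches no keyword and goes to "uncategorized"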
Example #9
def calc_flexibility_and_elaboration(responses, target_word, nlp):
    """Calculate flexibility (spacy similarity corrected for chance similarity) and elaboration (number of words).

    Arguments
    ---------
        responses: list
        target_word: string
        nlp: Spacy model

    Returns
    -------
        pandas dataframe with 3 columns: clean_response, elaboration, flexibility
    """
    data = pd.DataFrame({'clean_response': clean_text(response, nlp)}
                        for response in responses)
    data['elaboration'] = data.apply(
        lambda row: int(len(row.clean_response.split()))
        if row.clean_response is not None else None,
        axis=1)
    # to control for effects of response length (elaboration) on semantic similarity, calculate similarity expected by
    # chance for all given response lengths to subtract from response similarity
    # (Forthmann et al, 2018 https://doi.org/10.1002/jocb.240)
    word_counts = data.elaboration.unique()
    bootstrapped_sims = bootstrap_similarity(word_counts, target_word)

    data['raw_similarity'] = data.apply(
        lambda row: calc_similarity(row.clean_response, target_word, nlp),
        axis=1)
    data['corrected_similarity'] = data.apply(
        lambda row: row.raw_similarity - bootstrapped_sims[row.elaboration]
        if not np.isnan(row.elaboration) else row.raw_similarity,
        axis=1)
    # flexibility is dissimilarity score, so invert the similarity score to get flexibility
    data['flexibility'] = (1 - abs(data['corrected_similarity']))
    return data[['clean_response', 'elaboration', 'flexibility']]
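A usage sketch for the single-target variant, assuming a spaCy model with word vectors and the helper functions used above (clean_text, bootstrap_similarity, calc_similarity) are available; output values depend on the model.

import spacy

nlp = spacy.load("en_core_web_md")  # any model with word vectors
responses = ["a glowing ball of burning gas",
             "something bright you see during the day"]
scores = calc_flexibility_and_elaboration(responses, "sun", nlp)
print(scores[["elaboration", "flexibility"]])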
Example #10
def clean_data(remove_punctuation_marks, remove_stopwords):
    start = time.time()
    data_path = get_path(remove_punctuation_marks, remove_stopwords)
    for year in range(2018, 2019):
        data = read_data(year)
        entries = 0

        buffers = []
        mw_buffer = []
        f = open(data_path + str(year) + '.csv', 'a', errors='ignore')
        prev = start
        print('cleaning ', year)
        for i in range(0, len(data)):
            orig_text = ' '.join(data[i][2:]).replace('"', '').replace("'", '')
            if (orig_text.strip() != ''):
                clean_entry, _ = clean_text(
                    orig_text,
                    clean_only=True,
                    remove_stopwords=remove_stopwords,
                    remove_punctuation_marks=remove_punctuation_marks)
                print(i, "/", entries, " took: ",
                      time.time() - prev, " len tokens: ", len(clean_entry))
                prev = time.time()
                if (len(clean_entry) > 1):
                    entries += 1
                    buffers.append(clean_entry)
                    if ((len(buffers) + 1) % 3 == 0):
                        np.savetxt(f, buffers, fmt='%s', delimiter=",")
                        buffers = []
            else:
                print(orig_text)
        # flush any rows still buffered before closing the file
        if buffers:
            np.savetxt(f, buffers, fmt='%s', delimiter=",")
        f.close()
        end = time.time()
        print('took overall: ', end - start)
    return entries
Example #11
    def predict(self, x):
        predicts = []

        for test_str in x:
            test_str = "<s> " + test_str + " </s>"

            a = clean_text.clean_text(test_str)
            print(a)
            a = a.split(' ')

            x_seq = numpy.zeros((len(a), self.n_in), dtype='float64')
            y_seq = numpy.zeros((len(a),), dtype='int32')

            for i in range(len(a)):
                word = a[i]
                if word in self.word2label:
                    y_seq[i] = self.word2label[word]
                else:
                    y_seq[i] = self.word2label["%%%"]

                if word in self.model:
                    x_seq[i, :] = self.model[word]
                else:
                    x_seq[i, :] = self.model["xxxxx"]

            predicts.append(self.f_pred(*[x_seq, y_seq]))

        return predicts
Example #12
def get_cleaned_subject(tarpath, header):
    email = eh.read_email(tarpath)
    subject = eh.extract_header(email, header)
    if len(subject) != 0:
        cleaned = ct.clean_text(subject[0])
        return cleaned
    if len(subject) == 0:
        return []
Example #13
def get_current_normalized_skill(skill, punc_marks, stopwords, multiword):
    norm = " ".join(
        clean_text(skill,
                   remove_stopwords=not stopwords,
                   remove_punctuation_marks=not punc_marks)[0])
    if (multiword and len(norm.split()) > 1):
        norm, _ = get_multiword_tokens(norm.split())
        return norm
    else:
        return norm.split()
Example #14
def tokenize_space(sentence):
    '''Return the cleaned and tokenized sentence

    Example:
        >>> s = '1987 본 문 대통령.."그런다고 바뀌나? 함께 하면 바뀐다"'
        >>> tokenize_space(s)
        ['1987', '본', '문', '대통령', '그런다고', '바뀌나', '함께', '하면', '바뀐다']
    '''
    sentence = clean_text(sentence)
    sent_tokened = [i for i in sentence.split(' ') if len(i) > 0]
    return sent_tokened
Example #15
def clean_the_data(dataframe):
    """
    INPUT: a dataframe with unparsed text in the description field
    OUTPUT: a dataframe with cleaned text in the description field
    """
    for i in dataframe.index:
        # .loc replaces the removed DataFrame.ix accessor
        doc = ct.clean_text(dataframe.description.loc[i])
        dataframe.description.loc[i] = doc
    return dataframe
Example #16
def clean_training_data(multiword, punc_marks, stopwords, window):
    entries = read_training_data()
    for i in range(0, len(entries)):
        print(i)
        tokens, pos_tags = clean_text(entries[i][2],
                                      remove_stopwords=not stopwords,
                                      remove_punctuation_marks=not punc_marks)
        if (multiword):
            tokens, pos_tags = get_multiword_tokens(tokens, pos_tags)
        write_clean_tokens_to_file(tokens, pos_tags, multiword, punc_marks,
                                   stopwords, window)
    print('done')
Example #17
    def process_docs(self):
        """Process docs that have an id attribute and a question attribute."""
        self.documents = []
        if self._raw_docs and self._nlp:
            for doc in self._raw_docs:
                doc["question"] = clean_text(doc["question"].replace("\n", ""))
                if len(doc["question"]) == 0:
                    doc["question"] = " "
                processed_doc = self._nlp(doc["question"])
                processed_doc._.tag = doc["id"]
                processed_doc._.lemmatized = ' '.join(
                    [d.lemma_ for d in processed_doc])
                self.documents.append(processed_doc)
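For context, the input shape implied by the docstring and the body (illustrative values only):

# self._raw_docs = [
#     {"id": 11, "question": "Another one about the sun?"},
#     {"id": 33, "question": "What is the distance from the sun though?"},
# ]
# After process_docs(), self.documents holds spaCy Doc objects whose custom
# extensions carry the original id (doc._.tag) and the lemmatized text
# (doc._.lemmatized); both extensions are assumed to be registered elsewhere
# via Doc.set_extension.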
Example #18
def read_classification_data(data_path,
                             remove_punctuation_marks=True,
                             remove_stopwords=True,
                             multiword=False):
    p = data_path.rpartition('.')
    clean_data_path = p[0] + '_cleaned.' + p[2]
    if (not path.exists(p[0] + '_cleaned.' + p[2])):
        print('no cleaned data found for: ' + data_path)
        f = open(data_path, 'r')
        data = list(csv.reader(f, quoting=csv.QUOTE_NONE, delimiter=';'))
        f.close()
        print('Found ', len(data), "entries.")
        f = open(clean_data_path, 'a', errors='ignore')
        prev = time.time()
        entries = 0
        buffers = []
        for i in range(0, len(data)):
            orig_text = data[i][1].replace('"', '').replace("'", '')
            clean_entry, _ = clean_text(
                orig_text,
                clean_only=True,
                remove_stopwords=remove_stopwords,
                remove_punctuation_marks=remove_punctuation_marks)
            print(i, "/", entries, " took: ",
                  time.time() - prev, " len tokens: ", len(clean_entry))
            prev = time.time()
            if (len(clean_entry) > 1):
                entries += 1
                buffers.append(clean_entry)
                if ((len(buffers) + 1) % 3 == 0):
                    np.savetxt(f, buffers, fmt='%s', delimiter=",")
                    buffers = []
        # flush any rows still buffered before closing the file
        if buffers:
            np.savetxt(f, buffers, fmt='%s', delimiter=",")
        f.close()
    else:
        print('clean data found at: ' + clean_data_path)
        entries = []
        for year in range(2017, 2018):
            if (not multiword):
                f = open(clean_data_path, 'r')
                # keep the file handle in f so f.close() below closes the file, not a generator
                lines = (l.replace('\0', '') for l in f)
                entries = entries + list(csv.reader(lines, quotechar="'"))
                for i, data in enumerate(entries):
                    entries[i] = [
                        d.replace("'", '').replace("[",
                                                   "").replace("]",
                                                               "").strip()
                        for d in data
                    ]
                f.close()
        return entries
Example #19
def prepare_text(type):

    print("[*] Concatenation started\n")

    if type.lower() == 'train':
        inpath = "train"

        dirs = (file for file in listdir(inpath) if isdir(join(inpath, file)))

        for directory in dirs:
            print(directory)
            concatenate_files(
                join(inpath, directory),
                join(inpath, directory, 'Result',
                     'input_' + directory.lower() + '_tmp.txt'))
            clean_text(join(inpath, directory, 'Result',
                            'input_' + directory.lower() + '_tmp.txt'),
                       join(inpath, directory, 'Result',
                            'input_' + directory.lower() + '.txt'),
                       zip_files=False)
            remove(
                join(inpath, directory, 'Result',
                     'input_' + directory.lower() + '_tmp.txt'))

    elif type.lower() == 'test':
        inpath = "test"
        filename = "test_author"

        concatenate_files(inpath, join(inpath, 'Result',
                                       filename + '_tmp.txt'))
        clean_text(join(inpath, 'Result', filename + '_tmp.txt'),
                   join(inpath, 'Result', filename + '.txt'),
                   zip_files=False)
        remove(join(inpath, 'Result', filename + '_tmp.txt'))

    print("\n[*] Concatenation ended")
Example #20
File: ag.py Project: bit-ml/date
def export_ds(subsets, phase):
    raw_text = {
        "world" : '',
        "sports" : '',
        "business": '',
        "sci": ''
    }

    for data_label, subset in zip(subsets, raw_text):
        for text in subsets[data_label]:
            text = clean_text(text)
            raw_text[subset] += f'\n\n{text}'

    for subset in raw_text:
        dump_data(phase, subset, raw_text[subset])
Example #21
def process_title(title, article):
    title_set = clean_text(title)

    id = frozenset_to_filename(frozenset(title_set))
    if len(id) >= 254 or len(pathname2url(id)) >= 254:
        return "error"
    else:
        try:
            title_entry = titles_dict.get(id, Title(id=id, words=title_set))
            if article.id not in set(title_entry.articles):
                title_entry.add_article(article.id)
                titles_dict[id] = title_entry

            add_title_to_words(title_entry)
            return title_entry
        except:
            return "error"
Example #22
def get_search_result(query):
    """ Main Program """

    cleaned_query = clean_text(query)

    tag_name = get_tag(cleaned_query)
    # If tag_name is None, the whole Title column
    # is given as input to search_result to
    # find similarity with the entire corpus
    if tag_name:
        df = subset_df(tag_name)
        output_var = search_result(df, cleaned_query)
    else:
        output_var = search_result(medium_cleaned, cleaned_query)

    return output_var
Example #23
def export_ds(subset, groups):
    for entry in groups:
        dataset, _ = fetch_20newsgroups(data_home='./20ng_od',
                                        subset=subset,
                                        categories=entry['names'],
                                        remove=('headers', 'footers',
                                                'quotes'),
                                        return_X_y=True)
        corpus = ''
        for article in dataset:
            stripped = re.sub(r'\s+', ' ', article)
            stripped = clean_text(stripped)
            corpus += f'\n\n{stripped}'

        full_path = os.path.join('./20ng_od', subset)

        if not os.path.exists(full_path):
            os.makedirs(full_path)

        with open(os.path.join(full_path, f'{entry["topic"]}.txt'), 'w') as f:
            f.write(corpus)
Example #24
def LDA(text):
    
    from gensim.models import ldamodel    
    from clean_text import clean_text
    from LDA_AuxFunc import sent_to_words, remove_stopwords, make_bigrams, lemmatize
    
    import gensim.corpora as corpora
    
    import warnings
    import logging
    
    
    logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.ERROR)
    warnings.filterwarnings("ignore", category = DeprecationWarning)

    text = clean_text(text)
    
    data_words = list(sent_to_words(text))
    
    data_words_nostops = remove_stopwords(data_words)
    
    data_words_bigrams = make_bigrams(data_words, data_words_nostops)
    
    data_lemmatized = lemmatize(data_words_bigrams, allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV'])
    
    id2word = corpora.Dictionary(data_lemmatized)
    
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]
    
    lda_model = ldamodel.LdaModel(corpus = corpus,
                                  id2word = id2word,
                                  num_topics = 3, 
                                  random_state = 1,
                                  update_every = 1,
                                  chunksize = 50,
                                  passes = 20,
                                  alpha = 'auto',
                                  per_word_topics = True)
    
    return (lda_model, corpus, id2word)
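A usage sketch, assuming the clean_text and LDA_AuxFunc helper modules are importable and that text is an iterable of raw documents, which is how the pipeline above appears to treat it; the sample documents are illustrative only.

docs = [
    "The economy grew faster than expected this quarter.",
    "The home team won the championship after extra time.",
    "Astronomers discovered a new exoplanet orbiting a nearby star.",
]
lda_model, corpus, id2word = LDA(docs)
# print the top words of each of the 3 topics
for topic_id, top_words in lda_model.print_topics(num_topics=3, num_words=5):
    print(topic_id, top_words)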
Example #25
print("Read label2word ...")
with open("label2word.txt", 'r', encoding='utf-8') as file:
    for line in file:
        lines = line.strip('\n').split('\t')
        label = lines[0]
        word = lines[1]
        word2label[word] = label
        label2word[label] = word

print("Read training data ...")
#with open('train_small.txt', 'r', encoding='UTF-8') as file:
with open('training.txt', 'r', encoding='UTF-8') as file:
    for line in file:

        a = clean_text.clean_text(line)
        a = a.split(' ')

        if len(a) < 5:
            continue

        x_seq_list.append(a)
        '''
        x_seq = np.zeros((len(a), word_vec_len), dtype='float64')
        y_seq = np.zeros((len(a),), dtype='int32')

        for i in range(len(a)):
            word = a[i]

            if word in word2label:
                y_seq[i] = word2label[word]
Example #26
# Convert "<s>" and "</s>" to "."
word2label["."] = labelCount
label2word[labelCount] = "."
labelCount += 1

# All other (unknown) words map to "%%%"
word2label["%%%"] = labelCount
label2word[labelCount] = "%%%"
labelCount += 1

# Map word to labelID
with open('MLDS_Final/sentence/train_clean.set', 'r', encoding='UTF-8') as file:
    for line in file:

        a = clean_text.clean_text(line)
        a = a.split(' ')

        for i in range(len(a)):
            word = a[i]
            if word not in word2label:
                word2label[word] = labelCount
                label2word[labelCount] = word
                labelCount += 1

n_hidden = 100
n_in = word_vec_len
n_out = len(label2word)
RNN = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out,
                activation='tanh', output_type='softmax',
                use_symbolic_softmax=True)
Example #27
import clean_text as ct  
import parse_and_store as ps 
import word_vectors as wv 
import calculate_score as cs
import pickle
import gensim

description_model = gensim.models.Word2Vec.load('word_model')

"""
INPUT: the raw text to be analyzed
OUTPUT: a string saying whether the text is fraudulent or not
"""

if __name__ = "__main__":

	document = raw_input("enter the document to be analyzed:  ")
	clean_doc = ct.clean_text(document)
	doc_df = ps.store_and_parse(clean_doc)
	features = wv.write_word_vectors(doc_df, description_model) 
	model = pickle.load( open( 'MODEL', 'rb' ) )
	y_pred = model.predict(features)
	scoring_df = cs.scoring_df(doc_df, y_pred)
	cs.calculate_frauds(scoring_df)
Example #28
    def get_cleaned_readings(self, n):
        df_raw = download_readings(n)
        df = df_raw.copy()
        for each in df_raw.columns:
            df[each] = df[each].apply(lambda x: clean_text(x))
        return df
Example #29
def get_cleaned_body(tarpath):
    email = eh.read_email(tarpath)
    body = eh.extract_body(email)
    cleaned = ct.clean_text(body)
    return cleaned
Example #30
    location, location_type = dataset_helpers.get_tweet_location(hit)
    username, verified = dataset_helpers.get_tweet_user(hit)
    tweets.append((text, username, verified, location, location_type))
    if args.numresults != -1 and i + 1 == args.numresults:
        break

num_tweets = len(tweets)
print("{0} tweets found.".format(num_tweets))
print()

#clean the text
print("Cleaning text...")
print()
if args.modeltype == "Word2Vec":
    tweets = [(clean_text(t,
                          normalize_case=True,
                          blacklist_regex="non_alpha_numeric"), u, v,
               clean_text(l), p) for t, u, v, l, p in tweets]
elif args.modeltype == "TFHub":
    tweets = [(clean_text(t), u, v, clean_text(l), p)
              for t, u, v, l, p in tweets]
else:
    raise "Unknown model type."

#filter out empty tweets and replace empty locations
tweets = [t for t in tweets if t[0] != ""]
tweets = [(t, u, v, l if l != "" else "[No location available]", p)
          for t, u, v, l, p in tweets]
empty_tweets = num_tweets - len(tweets)
if empty_tweets > 0:
    print("Removed {0} empty tweet(s).".format(empty_tweets))
Example #31
repos_dir = sys.argv[1]
dest = sys.argv[2]

for repo_dir in os.listdir(repos_dir):
    full_path = os.path.join(repos_dir, repo_dir)
    if not os.path.isdir(full_path):
        continue
    dest_path = os.path.join(dest, repo_dir)
    with open(dest_path, 'w') as out_handle:
        total_read = 0
        for root, dirs, files in os.walk(full_path):
            for filename in files:
                f_full_path = os.path.join(root, filename)

                try:
                    with open(f_full_path) as handle:
                        everything = handle.read(100000)  # read up to 100 KB per file
                        everything = ' '.join(clean_text(everything))
                        total_read += len(everything.encode('utf-8'))

                        out_handle.write(everything)
                        out_handle.write(' ')

                        if total_read > 1000000:
                            break
                except:
                    pass

            if total_read > 1000000:
                break
Example #32
print("Read label2word ...")
with open("label2word.txt", 'r', encoding='utf-8') as file:
    for line in file:
        lines = line.strip('\n').split('\t')
        label = lines[0]
        word = lines[1]
        word2label[word] = label
        label2word[label] = word

print("Read training data ...")
with open('train_small.txt', 'r', encoding='UTF-8') as file:
#with open('training.txt', 'r', encoding='UTF-8') as file:
    for line in file:

        a = clean_text.clean_text(line)
        a = a.split(' ')

        if len(a) < 5:
            continue

        x_seq = np.zeros((len(a), word_vec_len), dtype='float64')
        y_seq = np.zeros((len(a),), dtype='int32')

        for i in range(len(a)):
            word = a[i]

            if word in word2label:
                y_seq[i] = word2label[word]
            else:
                y_seq[i] = word2label["%%%"]
Example #33
# Convert "<s>" and "</s>" to "."
word2label["."] = labelCount
label2word[labelCount] = "."
labelCount += 1

# All other (unknown) words map to "%%%"
word2label["%%%"] = labelCount
label2word[labelCount] = "%%%"
labelCount += 1

# Map word to labelID
with open('MLDS_Final/sentence/train_clean.set', 'r',
          encoding='UTF-8') as file:
    for line in file:

        a = clean_text.clean_text(line)
        a = a.split(' ')

        for i in range(len(a)):
            word = a[i]
            if word not in word2label:
                word2label[word] = labelCount
                label2word[labelCount] = word
                labelCount += 1

n_hidden = 100
n_in = word_vec_len
n_out = len(label2word)
RNN = MetaRNN(n_in=n_in,
              n_hidden=n_hidden,
              n_out=n_out,