def TrainToBags(df, vocab, test=False, max_length=300):
    dictionary = {word: idx for idx, word in enumerate(vocab)}
    bags = list()
    print("Cleaning data ...")
    with tqdm(total=df.shape[0]) as counter:
        for idx, row in df.iterrows():
            words = [
                tokenizer.TreebankWordTokenizer().tokenize(sent)
                for sent in sent_tokenize(row["text"])
            ]
            bag, sentences, lengths = bag_to_ids(dictionary, words, max_length)
            if test:
                bags.append({
                    "article": bag,
                    "lengths": lengths,
                    "sent_lengths": sentences
                })
            else:
                bags.append({
                    "article": bag,
                    "lengths": lengths,
                    "labels": row["labels"],
                    "target_label": row["target"],
                    "action_label": row["action"],
                    "sent_lengths": sentences
                })
            counter.update(1)
    return bags
def get_vocabs(df):
    data = [tokenizer.TreebankWordTokenizer().tokenize(sent) for sent in df]
    dictionary = Counter([word.lower() for sent in data for word in sent])
    words, counts = zip(
        *sorted(dictionary.items(), key=operator.itemgetter(1), reverse=True))
    vocab = list(words[:10000]) + ["<unk>", "<pad>"]
    print("vocab size:", len(vocab))
    return vocab
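# A minimal, self-contained sketch (not part of the original code) of how a vocab
# like the one returned by get_vocabs can be used to map tokens to IDs. bag_to_ids
# is not shown here, so the fallback to "<unk>" is an assumption about its
# behaviour rather than the actual helper.
from nltk.tokenize import TreebankWordTokenizer

toy_vocab = ["the", "cat", "sat", "<unk>", "<pad>"]  # toy vocab for illustration
toy_dictionary = {word: idx for idx, word in enumerate(toy_vocab)}
unk_id = toy_dictionary["<unk>"]

toy_tokens = TreebankWordTokenizer().tokenize("The cat sat on the mat.")
toy_ids = [toy_dictionary.get(tok.lower(), unk_id) for tok in toy_tokens]
print(toy_ids)  # [0, 1, 2, 3, 0, 3, 3] -- out-of-vocab tokens map to <unk>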
def tokenizefile(fname):
    tbt = tk.TreebankWordTokenizer()
    f = open(fname)
    # discard the first two lines, then read at most 500,000 characters
    f.readline()
    f.readline()
    raw = f.read(500000)
    # collapse digit runs to a single '#', drop selected punctuation,
    # and collapse runs of dashes
    raw = re.sub('[0-9]', '#', raw)
    raw = re.sub('#+', '#', raw)
    raw = re.sub(r'[~%<>^&*()+={}/\\]', ' ', raw)
    raw = re.sub('-+', '-', raw)
    f.close()
    return tbt.tokenize(raw)
def __init__(self):
    self.stemmer = SnowballStemmer('english', ignore_stopwords=True)
    self.tkn = tokenize.TreebankWordTokenizer()
    self.fit_text_title = False
    self.text_title_count_v = CountVectorizer(stop_words='english',
                                              min_df=50,
                                              tokenizer=self.my_tokenizer)
    self.white_space_tkn = tokenize.WhitespaceTokenizer().tokenize
    self.source_count_v = CountVectorizer(lowercase=False,
                                          tokenizer=self.white_space_tkn)
    self.person_count_v = CountVectorizer(lowercase=False,
                                          tokenizer=self.white_space_tkn)
def get_words(self):
    """
    Returns a list of all words found in the text.
    """
    word_tokenizer = tokenize.TreebankWordTokenizer()
    words = [
        w.strip().lower() for w in word_tokenizer.tokenize(self.text)
        if w.strip()
    ]
    # Remove punctuation from words:
    # e.g. <<This is the final.>> tokenizes to
    # ['<', '<', 'This', 'is', 'the', 'final', '.', '>', '>']
    # and becomes ['This', 'is', 'the', 'final']
    words = [re.sub(r"\W", '', word) for word in words]
    words = [word for word in words if word]
    return words
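# Standalone, hedged illustration (added here, not from the original file) of the
# cleanup performed by get_words: TreebankWordTokenizer splits punctuation into
# separate tokens and the \W substitution then strips them. The sample sentence
# is made up.
import re
from nltk.tokenize import TreebankWordTokenizer

sample = "<<This is the final.>>"
tokens = [t.strip().lower() for t in TreebankWordTokenizer().tokenize(sample) if t.strip()]
cleaned = [re.sub(r"\W", "", t) for t in tokens]
cleaned = [w for w in cleaned if w]
print(cleaned)  # ['this', 'is', 'the', 'final']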
def main():
    train_path = "SemEval2010_task8_training/TRAIN_FILE.TXT"
    train_file = open(data_folder + train_path, "r").readlines()
    params = json.load(open("params.json"))
    train_sent, name_labels, dir_labels, e1, e2, p1, p2 = (
        list(), list(), list(), list(), list(), list(), list())
    # processes the train set
    for i in range(8000):
        start = i * 4
        line = train_file[start]
        line = line.split("\t")[1].replace('"', "").replace("\n", "").replace(
            "<", "").replace(">", "").replace("'", "").lower()
        tokens = tokenizer.TreebankWordTokenizer().tokenize(line)
        # gets the position of e1 and e2 words
        e1_clause, e2_clause, e1_s, e1_e, e2_s, e2_e = get_e1_e2(tokens)
        e1.append(e1_clause)
        e2.append(e2_clause)
        train_sent.append(tokens)
        # rel shows the relation and dir shows its direction
        rel = train_file[start + 1].replace("\n", "")
        if rel == "Other":
            name_labels.append("Other")
            dir_labels.append(1)
        else:
            name_labels.append(rel[:rel.find("(")])
            if rel[-3:-1] == "e2":
                dir_labels.append(0)
            else:
                dir_labels.append(1)
        # positional vectors for the sentences regarding e1 and e2
        pos_1, pos_2 = get_pos(tokens, e1_s, e1_e, e2_s, e2_e)
        p1.append(pos_1)
        p2.append(pos_2)

    test_file = open(data_folder + "SemEval2010_task8_testing/TEST_FILE.txt", "r")
    test_sent, test_name_labels, test_dir_labels, test_e1, test_e2, test_p1, test_p2 = (
        list(), list(), list(), list(), list(), list(), list())
    # process the test data
    for l in test_file.readlines():
        line = l.split("\t")[1].replace('"', "").replace("\n", "").replace(
            "<", "").replace(">", "").replace("'", "").lower()
        tokens = tokenizer.TreebankWordTokenizer().tokenize(line)
        test_e1_clause, test_e2_clause, test_e1_s, test_e1_e, test_e2_s, test_e2_e = get_e1_e2(
            tokens)
        test_e1.append(test_e1_clause)
        test_e2.append(test_e2_clause)
        test_sent.append(tokens)
        test_pos_1, test_pos_2 = get_pos(tokens, test_e1_s, test_e1_e,
                                         test_e2_s, test_e2_e)
        test_p1.append(test_pos_1)
        test_p2.append(test_pos_2)

    # gets all the vocabs in the train set
    vocabs, chars, params["max_char"], params["max_len"] = get_vocabs_chars(
        train_sent)
    print("There are", len(vocabs), "words in the dataset")
    # train and test sentences are converted to their word IDs
    train_sent = words_to_id(vocabs, train_sent)
    test_sent = words_to_id(vocabs, test_sent)
    print("Converting tags to numbers")
    # relations are converted to their IDs
    name_labels, tag_dict = tags_to_id(name_labels, set(name_labels))
    labels = [(name, dir) for name, dir in zip(name_labels, dir_labels)]
    params["n_outputs"] = len(tag_dict.values())
    tag_dict = {idx: tag for tag, idx in tag_dict.items()}
    # reads the word embedding file
    if params["pretrain"]:
        print("Loading GloVe pretrained vectors")
        embeddings = read_embedding(vocabs, 'embeddings/glove.300.txt')
    else:
        embeddings = None
    X = np.array(train_sent)
    y = np.array(labels)
    # split train to train and dev
    X_train, X_dev, y_train, y_dev, indices_train, indices_dev = train_test_split(
        X, y, range(len(train_sent)), test_size=0.2, random_state=33)
    json.dump(params, open("params.json", "w"))
    true_dev_labels = [true_label(idx[0], idx[1], tag_dict) for idx in y_dev]
    train_batches = get_batches(X_train, y_train, vocabs.index("<pad>"))
    dev_batches = get_batches(X_dev, y_dev, vocabs.index("<pad>"))
    p1_train = just_batch([p1[idx] for idx in indices_train], pad_idx=1000)
    p2_train = just_batch([p2[idx] for idx in indices_train], pad_idx=1000)
    p1_dev = just_batch([p1[idx] for idx in indices_dev], pad_idx=1000)
    p2_dev = just_batch([p2[idx] for idx in indices_dev], pad_idx=1000)
    p1_test = just_batch(test_p1, pad_idx=1000)
    p2_test = just_batch(test_p2, pad_idx=1000)
    test_batches = get_batches(test_sent, pad_idx=vocabs.index("<pad>"))
    data = {
        "vocab": vocabs,
        "tag_dict": tag_dict,
        "embeddings": embeddings,
        "train_batches": train_batches,
        "dev_batches": dev_batches,
        "true_dev_labels": true_dev_labels,
        "p1_train": p1_train,
        "p1_dev": p1_dev,
        "p2_train": p2_train,
        "p2_dev": p2_dev,
        "p1_test": p1_test,
        "p2_test": p2_test,
        "test_batches": test_batches
    }
    pickle.dump(data, open("data.pkl", "wb"))
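# The main() above pickles everything into data.pkl; a downstream training script
# would presumably reload it roughly as follows (a sketch only -- the consuming
# script is not shown here, and the key names are taken from the dict above).
import pickle

with open("data.pkl", "rb") as f:
    data = pickle.load(f)

vocabs = data["vocab"]
train_batches = data["train_batches"]
dev_batches = data["dev_batches"]
print(len(vocabs), "vocabulary entries,", len(train_batches), "training batches")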
from collections import Counter
import sys

import nltk
from nltk import tokenize
from nltk import ngrams, FreqDist
from nltk.corpus import stopwords
import pprint

word_tokenizer = tokenize.TreebankWordTokenizer()
counter = Counter()
orig_stdout = sys.stdout
stopWords = set(stopwords.words('english'))
filename = 'all_speeches.txt'


def text_opener(rawtext):
    content = open(rawtext).read()
    sentence_list = tokenize.sent_tokenize(content)
    separate_words = word_tokenizer.tokenize_sents(sentence_list)
    # flatten the per-sentence token lists and drop stopwords
    words = [
        word for sentence in separate_words for word in sentence
        if word not in stopWords
    ]
    return words
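# Hedged usage sketch for text_opener (not in the original file): read the speech
# file named above and report the most frequent non-stopword tokens. Assumes
# all_speeches.txt exists next to the script and the NLTK punkt/stopwords data
# has been downloaded.
words = text_opener(filename)
counter.update(word.lower() for word in words)
print(counter.most_common(10))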
import copy, inspect
import os
import re

import pandas as pd
from gensim.models import TfidfModel
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy.spatial.distance import cosine
from nltk import tokenize as nltk_token
from nltk.stem import SnowballStemmer

stem = SnowballStemmer("english").stem

link_re = re.compile(r"(http(s)?[^\s]*)|(pic\.[s]*)")
hashtag_re = re.compile(r"#[a-zA-Z0-9_]+")
mention_re = re.compile(r"@[a-zA-Z0-9_]+")
pat_type = {'links': link_re, 'hashtags': hashtag_re, 'mentions': mention_re}

tokenizers = {'treebank': nltk_token.TreebankWordTokenizer().tokenize,
              'wordpunct': nltk_token.WordPunctTokenizer().tokenize,
              'tweettokenize': nltk_token.TweetTokenizer().tokenize}


def read_file(path):
    if not os.path.exists(path):
        raise ValueError("Path does not point to existing file: {}".format(path))
    ending = path.split('.')[-1]
    if ending == 'csv':
        return pd.read_csv(path)
    elif ending == 'tsv':
        return pd.read_csv(path, delimiter='\t')
    elif ending == 'pkl':
        return pd.read_pickle(path)
    elif ending == 'json':
        return pd.read_json(path)
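# Hedged usage sketch for the helpers above (added for illustration): strip
# links, hashtags and mentions with the compiled patterns, then tokenize with
# one of the registered tokenizers. The sample tweet and file name are made up.
tweet = "Check https://example.com #nlp @someone it's great"
cleaned_tweet = tweet
for name, pattern in pat_type.items():
    cleaned_tweet = pattern.sub("", cleaned_tweet)
print(tokenizers['tweettokenize'](cleaned_tweet))
# df = read_file("tweets.csv")  # hypothetical path; dispatch is on the extension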