Example #1
from nltk import tokenize as tokenizer          # assumed alias for TreebankWordTokenizer
from nltk.tokenize import sent_tokenize         # sentence splitter used below
from tqdm import tqdm                           # progress bar

def TrainToBags(df, vocab, test=False, max_length=300):
    """Convert each article in df into a bag of word IDs drawn from vocab."""
    dictionary = {word: idx for idx, word in enumerate(vocab)}
    bags = list()
    print("Cleaning data ...")
    with tqdm(total=df.shape[0]) as counter:
        for idx, row in df.iterrows():
            words = [
                tokenizer.TreebankWordTokenizer().tokenize(sent)
                for sent in sent_tokenize(row["text"])
            ]
            bag, sentences, lengths = bag_to_ids(dictionary, words, max_length)
            if test:
                bags.append({
                    "article": bag,
                    "lengths": lengths,
                    "sent_lengths": sentences
                })
            else:
                bags.append({
                    "article": bag,
                    "lengths": lengths,
                    "labels": row["labels"],
                    "target_label": row["target"],
                    "action_label": row["action"],
                    "sent_lengths": sentences
                })
            counter.update(1)
    return bags
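A minimal usage sketch for TrainToBags, assuming a DataFrame with the columns accessed above; bag_to_ids is a project helper that is not shown in this snippet, so the call below is illustrative only.

# Hypothetical usage of TrainToBags
import pandas as pd

df = pd.DataFrame({
    "text": ["The first article.", "Another short article."],
    "labels": [[0], [1]],
    "target": [0, 1],
    "action": [1, 0],
})
vocab = ["the", "first", "article", ".", "another", "short", "<unk>", "<pad>"]
train_bags = TrainToBags(df, vocab)             # needs bag_to_ids from the original project
test_bags = TrainToBags(df, vocab, test=True)   # test mode drops the label fields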
Example #2
from collections import Counter                 # word-frequency counts
import operator                                 # sort key used below

def get_vocabs(df):
    """Return the 10,000 most frequent lowercased tokens plus <unk> and <pad>."""
    # tokenizer is assumed to be nltk.tokenize, as in Example #1
    data = [tokenizer.TreebankWordTokenizer().tokenize(sent) for sent in df]
    dictionary = Counter([word.lower() for sent in data for word in sent])
    words, counts = zip(
        *sorted(dictionary.items(), key=operator.itemgetter(1), reverse=True))
    vocab = list(words[:10000]) + ["<unk>", "<pad>"]
    print("vocab size:", len(vocab))
    return vocab
import re                                       # used for the clean-up substitutions below
from nltk import tokenize as tk                 # assumed alias for nltk.tokenize

def tokenizefile(fname):
    """Tokenize up to 500 kB of a file, normalising digits and stray symbols first."""
    tbt = tk.TreebankWordTokenizer()
    f = open(fname)
    f.readline()                                # skip two header lines
    f.readline()
    raw = f.read(500000)                        # read at most 500,000 characters
    raw = re.sub(r'[0-9]', '#', raw)            # replace digits with a placeholder
    raw = re.sub(r'#+', '#', raw)               # collapse runs of placeholders
    raw = re.sub(r'[~%<>^&*()+={}/\\]', ' ', raw)   # raw string avoids the invalid escape
    raw = re.sub(r'-+', '-', raw)               # collapse runs of hyphens
    f.close()
    return tbt.tokenize(raw)
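A brief, hedged usage example; the file name is invented, and any plain-text file with two header lines would do.

# Hypothetical usage of tokenizefile
tokens = tokenizefile("speech.txt")             # hypothetical input file
print(len(tokens), "tokens, e.g.", tokens[:10])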
    def __init__(self):

        self.stemmer = SnowballStemmer('english', ignore_stopwords=True)
        self.tkn = tokenize.TreebankWordTokenizer()

        self.fit_text_title = False
        self.text_title_count_v = CountVectorizer(stop_words='english',
                                                  min_df=50,
                                                  tokenizer=self.my_tokenizer)

        self.white_space_tkn = tokenize.WhitespaceTokenizer().tokenize
        self.source_count_v = CountVectorizer(lowercase=False,
                                              tokenizer=self.white_space_tkn)
        self.person_count_v = CountVectorizer(lowercase=False,
                                              tokenizer=self.white_space_tkn)
    def get_words(self):
        """
            Returns a list of all words found in the text.
        """

        word_tokenizer = tokenize.TreebankWordTokenizer()
        words = [
            w.strip().lower() for w in word_tokenizer.tokenize(self.text)
            if w.strip()
        ]

        # Remove punctuation from words:
        # Ex.:  <<This is the final.>>  becomes
        # ['<','<', 'This', 'is', 'the', 'final', '.', '>', '>'] -> ['This', 'is', 'the', 'final']
        words = [re.sub(r"\W", '', word) for word in words]
        words = [word for word in words if word]

        return words
# Library imports assumed by this snippet; project helpers such as get_e1_e2, get_pos,
# get_vocabs_chars, words_to_id, tags_to_id, read_embedding, get_batches, just_batch,
# true_label and the data_folder constant are defined elsewhere in the original module.
import json
import pickle
import numpy as np
from nltk import tokenize as tokenizer          # assumed alias, as in the snippets above
from sklearn.model_selection import train_test_split


def main():
    train_path = "SemEval2010_task8_training/TRAIN_FILE.TXT"
    train_file = open(data_folder + train_path, "r").readlines()
    params = json.load(open("params.json"))

    train_sent, name_labels, dir_labels = [], [], []
    e1, e2, p1, p2 = [], [], [], []

    #processes the train set
    for i in range(8000):
        start = i * 4
        line = train_file[start]
        line = line.split("\t")[1].replace('"', "").replace("\n", "").replace(
            "<", "").replace(">", "").replace("'", "").lower()
        tokens = tokenizer.TreebankWordTokenizer().tokenize(line)

        # gets the position of e1 and e2 words
        e1_clause, e2_clause, e1_s, e1_e, e2_s, e2_e = get_e1_e2(tokens)
        e1.append(e1_clause)
        e2.append(e2_clause)
        train_sent.append(tokens)

        # rel shows the relation and dir shows its direction
        rel = train_file[start + 1].replace("\n", "")
        if rel == "Other":
            name_labels.append("Other")
            dir_labels.append(1)
        else:
            name_labels.append(rel[:rel.find("(")])
            if rel[-3:-1] == "e2":
                dir_labels.append(0)
            else:
                dir_labels.append(1)
        # positional vectors for the sentences regarding e1 and e2
        pos_1, pos_2 = get_pos(tokens, e1_s, e1_e, e2_s, e2_e)
        p1.append(pos_1)
        p2.append(pos_2)

    test_file = open(data_folder + "SemEval2010_task8_testing/TEST_FILE.txt",
                     "r")
    test_sent, test_name_labels, test_dir_labels = [], [], []
    test_e1, test_e2, test_p1, test_p2 = [], [], [], []

    # process the test data
    for l in test_file.readlines():
        line = l.split("\t")[1].replace('"', "").replace("\n", "").replace(
            "<", "").replace(">", "").replace("'", "").lower()
        tokens = tokenizer.TreebankWordTokenizer().tokenize(line)
        test_e1_clause, test_e2_clause, test_e1_s, test_e1_e, test_e2_s, test_e2_e = get_e1_e2(
            tokens)
        test_e1.append(test_e1_clause)
        test_e2.append(test_e2_clause)
        test_sent.append(tokens)
        test_pos_1, test_pos_2 = get_pos(tokens, test_e1_s, test_e1_e,
                                         test_e2_s, test_e2_e)
        test_p1.append(test_pos_1)
        test_p2.append(test_pos_2)

    # gets all the vocabs in train set
    vocabs, chars, params["max_char"], params["max_len"] = get_vocabs_chars(
        train_sent)
    print("There are", len(vocabs), "words in the dataset")

    # train and test sentences are converted to their word IDs
    train_sent = words_to_id(vocabs, train_sent)
    test_sent = words_to_id(vocabs, test_sent)

    print("Converting tags to numbers")
    # relations are converted to their IDs
    name_labels, tag_dict = tags_to_id(name_labels, set(name_labels))
    labels = list(zip(name_labels, dir_labels))  # (relation ID, direction) pairs

    params["n_outputs"] = len(tag_dict.values())
    tag_dict = {idx: tag for tag, idx in tag_dict.items()}

    # reads the word embedding file
    if params["pretrain"]:
        print("Loading GloVe pretrained vectors")
        embeddings = read_embedding(vocabs, 'embeddings/glove.300.txt')
    else:
        embeddings = None

    X = np.array(train_sent)
    y = np.array(labels)

    # split train to train and dev
    X_train, X_dev, y_train, y_dev, indices_train, indices_dev = train_test_split(
        X, y, range(len(train_sent)), test_size=0.2, random_state=33)

    json.dump(params, open("params.json", "w"))
    true_dev_labels = [true_label(idx[0], idx[1], tag_dict) for idx in y_dev]
    train_batches = get_batches(X_train, y_train, vocabs.index("<pad>"))
    dev_batches = get_batches(X_dev, y_dev, vocabs.index("<pad>"))
    p1_train = just_batch([p1[idx] for idx in indices_train], pad_idx=1000)
    p2_train = just_batch([p2[idx] for idx in indices_train], pad_idx=1000)
    p1_dev = just_batch([p1[idx] for idx in indices_dev], pad_idx=1000)
    p2_dev = just_batch([p2[idx] for idx in indices_dev], pad_idx=1000)

    p1_test = just_batch(test_p1, pad_idx=1000)
    p2_test = just_batch(test_p2, pad_idx=1000)
    test_batches = get_batches(test_sent, pad_idx=vocabs.index("<pad>"))

    data = {
        "vocab": vocabs,
        "tag_dict": tag_dict,
        "embeddings": embeddings,
        "train_batches": train_batches,
        "dev_batches": dev_batches,
        "true_dev_labels": true_dev_labels,
        "p1_train": p1_train,
        "p1_dev": p1_dev,
        "p2_train": p2_train,
        "p2_dev": p2_dev,
        "p1_test": p1_test,
        "p2_test": p2_test,
        "test_batches": test_batches
    }
    pickle.dump(data, open("data.pkl", "wb"))
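As a purely illustrative follow-up, the pickled dictionary written above could be reloaded in a training script roughly like this:

# Hypothetical downstream loading of the preprocessed data
import pickle

with open("data.pkl", "rb") as fh:
    data = pickle.load(fh)
print(len(data["vocab"]), "vocab entries;", len(data["train_batches"]), "training batches")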
Example #7
import sys                                      # required for the sys.stdout reference below
import pprint
import nltk
from nltk import tokenize
from nltk import ngrams, FreqDist
from nltk.corpus import stopwords
from collections import Counter

word_tokenizer = tokenize.TreebankWordTokenizer()
counter = Counter()
stopWords = set(stopwords.words('english'))

orig_stdout = sys.stdout                        # keep a handle on the original stdout

filename = 'all_speeches.txt'


def text_opener(rawtext):
    content = open(rawtext).read()
    sentence_list = tokenize.sent_tokenize(content)
    separate_words = word_tokenizer.tokenize_sents(sentence_list)
    # flatten the tokenized sentences and keep only non-stopword tokens
    words = [
        word for sentence in separate_words for word in sentence
        if word not in stopWords
    ]

    return words
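A short, hedged usage sketch tying the helper to the counter, tokenizer and file name defined above (it assumes all_speeches.txt exists):

# Hypothetical usage: count the most frequent non-stopword tokens in the speeches file
counter.update(text_opener(filename))
pprint.pprint(counter.most_common(20))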
Example #8
from gensim.models import TfidfModel
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import copy, inspect
import os                                       # path check in read_file
import re                                       # URL/hashtag/mention patterns below
import pandas as pd                             # CSV/TSV/pickle/JSON readers in read_file
from scipy.spatial.distance import cosine
from nltk.stem.snowball import SnowballStemmer  # assumed import for the stemmer
from nltk import tokenize as nltk_token         # assumed alias used in the tokenizers dict

stem = SnowballStemmer("english").stem

link_re = re.compile(r"(http(s)?[^\s]*)|(pic\.[s]*)")
hashtag_re = re.compile(r"#[a-zA-Z0-9_]+")
mention_re = re.compile(r"@[a-zA-Z0-9_]+")

pat_type = {'links': link_re,
            'hashtags': hashtag_re,
            'mentions': mention_re}

tokenizers = {'treebank': nltk_token.TreebankWordTokenizer().tokenize,
              'wordpunct': nltk_token.WordPunctTokenizer().tokenize,
              'tweettokenize': nltk_token.TweetTokenizer().tokenize}
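A hedged illustration of how the pattern table and tokenizer table might be combined on a single tweet; the sample text is invented.

# Hypothetical usage: strip links, hashtags and mentions, then tokenize what remains
sample = "Check this out https://example.com #nlp @someone"
cleaned = sample
for pattern in pat_type.values():
    cleaned = pattern.sub("", cleaned)
tokens = tokenizers['tweettokenize'](cleaned)   # -> ['Check', 'this', 'out']
print(tokens)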

def read_file(path):
    if not os.path.exists(path):
        raise ValueError("Path does not point to existing file: {}".format(path))
    ending = path.split('.')[-1]
    if ending == 'csv':
        return pd.read_csv(path)
    elif ending == 'tsv':
        return pd.read_csv(path, delimiter='\t')
    elif ending == 'pkl':
        return pd.read_pickle(path)
    elif ending == 'json':