Example #1
 def add_sentence(self, sentence, tokenize, sentence_no):
     words = ['<s>'] + helper.tokenize(sentence, tokenize) + ['</s>']
     self.sentence1_str = sentence
     if sentence_no == 1:
         self.sentence1 = words
     else:
         self.sentence2 = words
Example #2
def worker_task(articles, args, worker_id, Dict):
	random.seed(args.seed)
	sys.stdout.write("{}: Begin processing article files...\n".format(worker_id))
	article_file_count = 0
	for article_file in articles:
		sys.stdout.write("{}: Begin processing file {}\n".format(worker_id, article_file))
		# process file, tokenize
		tokenized_sentences = {} # it's a dictionary with {index:tokenized_sentence} 
		tokenized_labels = {}
		it = 0
		with open(join(args.inputpath, article_file), "r") as F:
			for line in F:
				if line.startswith("<doc") or line.startswith("</doc>"):
					continue	
				sentences = [filter_with_alphabet(s, args.alphabet) for s in re.split(args.separator, sanitize_line(line))]
				for i in range(len(sentences)):
					tmp = tokenize(Dict, sentences[i], gram_length, token_weight)
					tokenized_sentences[it] = tmp
					tokenized_labels[it] = get_gram_label(Dict_gram_to_label, tmp)
					it += 1
		with open(join(args.outputpath + "/readable_articles", article_file + ".readable"), "w") as F:
			F.write(json.dumps(tokenized_sentences))
		with open(join(args.outputpath + "/articles", article_file), "w") as F:
			F.write(json.dumps(tokenized_labels))
		article_file_count += 1
		sys.stdout.write("{}: Finished processing {}th file {}\n".format(worker_id, article_file_count, article_file))
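A minimal driver sketch for worker_task (hypothetical; the real entry point that splits the article files across workers is not shown in this example). It assumes the files are simply round-robined over a few processes:

from multiprocessing import Process

def run_workers(article_files, args, Dict, n_workers=4):
    # Hypothetical sketch: split the file list round-robin and run one
    # worker_task (defined above) per process.
    chunks = [article_files[i::n_workers] for i in range(n_workers)]
    procs = [Process(target=worker_task, args=(chunk, args, worker_id, Dict))
             for worker_id, chunk in enumerate(chunks)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()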
Example #3
 def form_vocabulary(self, in_file, tokenize):
     """Creates the vocabulary."""
     assert os.path.exists(in_file)
     with open(in_file, 'r') as f:
         for line in f:
             session = json.loads(line)
             assert len(session['query']) == len(session['clicks'])
             for qidx in range(len(session['query'])):
                 query_terms = helper.tokenize(session['query'][qidx][0], tokenize)
                 query_letter_n_grams = self.get_letter_n_grams(query_terms, self.order_n_gram)
                 self.add_letter_n_grams(query_letter_n_grams)
                 for i in range(len(session['clicks'][qidx])):
                     doc_title = session['clicks'][qidx][i][1]
                     title_terms = helper.tokenize(doc_title, tokenize)
                     doc_letter_n_grams = self.get_letter_n_grams(title_terms, self.order_n_gram)
                     self.add_letter_n_grams(doc_letter_n_grams)
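The exact session format is not shown in the source. Inferred from the indexing above (session['query'][qidx][0] is the query text, session['clicks'][qidx][i][1] is a clicked document title), one JSON line might look like this hypothetical example:

# Hypothetical shape of one line in in_file (all values invented):
example_session = {
    "query":  [["best pizza near me"]],
    "clicks": [[["doc_42", "Top 10 pizza places in town"]]],
}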
Example #4
 def add_sentence(self, sentence, tokenize, sentence_no, dictionary,
                  is_test_instance):
     words = ['<s>'] + helper.tokenize(sentence, tokenize) + ['</s>']
     if not is_test_instance:
         for word in words:
             dictionary.add_word(word)
     if sentence_no == 1:
         self.sentence1 = words
     else:
         self.sentence2 = words
Example #5
 def add_pos_tags(self, sentence, tokenize):
     """We assume that the raw sentence will be passed in -> this is for self-attentive network
     :param sentence: raw sentence not tokenized
     :return: void
     """
     tokenized_sent = helper.tokenize(sentence, tokenize)
     pos_tags = pos_tag(tokenized_sent)
     self.pos_tags = [tag[1] for tag in pos_tags]
     for i in range(len(self.pos_tags)):
         if self.pos_tags[i] not in pos_to_idx:
             self.pos_tags[i] = 'UNK'
Example #6
 def train(self, data, category):
     self.prior_counts[category] += 1
     words = tokenize(data)
     count_dict = count_words(words)
     for word, count in count_dict.items():
         if word not in self.vocab:
             self.vocab[word] = 0.0  # use 0.0 here so Python does "correct" math
         if word not in self.word_counts[category]:
             self.word_counts[category][word] = 0.0
         self.vocab[word] += count
         self.word_counts[category][word] += count
     self.update_prior_prob()
Example #7
 def add_text(self, text, tokenize, max_len):
     content_terms = helper.tokenize(text, tokenize)
     content_terms = content_terms if len(content_terms) <= max_len else content_terms[:max_len]
     content_terms = ['<s>'] + content_terms + ['<s>']
     content_terms = ['#' + item + '#' for item in content_terms]
     for i in range(len(content_terms)):
         # create letter-trigrams
         word = content_terms[i]
         letter_trigrams_for_words = []
         for j in range(0, len(word) - 2):
             letter_trigrams_for_words.append(word[j:j + 3])
         self.letter_trigrams.append(letter_trigrams_for_words)
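As a quick check of the trigram loop above: each term is wrapped in '#' markers and every 3-character window becomes one letter-trigram.

# Standalone re-run of the loop above for a single padded term.
word = '#deep#'
letter_trigrams_for_word = [word[j:j + 3] for j in range(0, len(word) - 2)]
print(letter_trigrams_for_word)  # ['#de', 'dee', 'eep', 'ep#']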
Example #8
def imagenet_flickr8k_intersection():
    path = "../data/Flickr8k_text/Flickr8k.token.txt"
    wordlist, worddictionary = tokenize(path)
    worddf = dicttodf(worddictionary)
    path2 = "/Users/sebastiaanscholten/Documents/speech2image-master/vgsexperiments/experiments/data/imagenet_class_index.json"

    with open(path2, "r") as json_file:
        data = json.load(json_file)

    wordlist1 = list(worddf["words"])
    wordlist2 = []
    for words in data.values():
        wordlist2.append(words[1])

    return intersection(wordlist1, wordlist2)
Example #9
    def predict(self, data):
        words = tokenize(data)
        count_dict = count_words(words)
        p_abstract_given = self.init_p_abstract_given_category()

        for w, cnt in count_dict.items():
            #if w in stopwords:
            #   continue
            p_w_given = self.calc_p_w_given_category(w)
            for category in self.categories:
                p_abstract_given[category] *= p_w_given[category]**cnt

        posterior_prob = self.calc_posterior_prob(p_abstract_given)
        predicted_category = max(posterior_prob.items(),
                                 key=operator.itemgetter(1))[0]
        return predicted_category
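calc_p_w_given_category and calc_posterior_prob are not shown here; the final step is just an argmax over per-category scores (prior times the product of per-word likelihoods, so normalizing does not change the winner). A toy sketch with invented numbers:

import operator

# Invented scores: prior * product of p(w|category)**count for two categories.
p_abstract_given = {"sports": 0.2 * 0.05 ** 2, "politics": 0.8 * 0.01 ** 2}
predicted = max(p_abstract_given.items(), key=operator.itemgetter(1))[0]
print(predicted)  # 'sports' (0.0005 > 0.00008)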
Example #10
def write(input_string, output_path):
  with open(output_path, "w") as f:
    writer = csv.writer(f, delimiter="\t")
    tokens = helper.tokenize(input_string)
    normalizations = []
    tokens_and_normalizations = []

    for token in tokens: #normalizing and storing in list
      normalizations.append(helper.normalize(token))
    
    for i in range(len(tokens)): #creating pairs and storing
      temporary_pair = []        #them in a nested list
      temporary_pair.append(tokens[i])
      temporary_pair.append(normalizations[i])
      tokens_and_normalizations.append(temporary_pair)
    
    writer.writerows(tokens_and_normalizations)
Example #11
 def add_text(self, text, tokenize, max_len):
     content_terms = helper.tokenize(text, tokenize)
     content_terms = content_terms if len(content_terms) <= max_len else content_terms[:max_len]
     for i in range(len(content_terms)):
         # create letter-trigrams
         word = content_terms[i]
         word_letter_n_grams = []
         for j in range(1, self.order_n_gram + 1):
             if j > len(word):
                 break
             else:
                 # create letter_n_grams where n = j
                 word_letter_n_grams.extend(self.find_letter_ngrams(
                     word, j))
         if word_letter_n_grams:
             self.letter_n_grams.append(word_letter_n_grams)
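find_letter_ngrams is not shown in the source; assuming it returns every contiguous n-character slice of a word, the loop above collects all orders up to order_n_gram:

def find_letter_ngrams(word, n):
    # Assumed behavior: all contiguous n-character slices of the word.
    return [word[i:i + n] for i in range(len(word) - n + 1)]

word, order_n_gram = 'cat', 3
word_letter_n_grams = []
for j in range(1, order_n_gram + 1):
    if j > len(word):
        break
    word_letter_n_grams.extend(find_letter_ngrams(word, j))
print(word_letter_n_grams)  # ['c', 'a', 't', 'ca', 'at', 'cat']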
Example #12
def write(input_string, output_path):
    with open(output_path, "w", newline="") as f:
        # To write .csv and .tsv, you first open a
        # file, then you call csv.writer() and give
        # it the file as an argument as seen below.
        # (It doesn't work if you haven't imported
        # the csv module above.)
        # writer = csv.writer(f)
        writer = csv.writer(f, delimiter="\t")

        # The csv.writer() method can either just take
        # one argument, the file it will write to, in
        # which case it looks just as above. It can,
        # however, take a second argument, and look
        # as follows:
        # csv.writer(f, delimiter=",")
        # which will tell the csv module explicitly
        # that we want to use , as a symbol to separate
        # the individual fields.
        # TODO: Change the csv.writer() call above so
        # that the csv module uses the tab as a
        # delimiter. Check exercise-5-readme.md
        # if you are unsure.

        # Then, you can use writer.writerows() to write
        # your .csv file. The writerows() function
        # takes as argument a list of lists. For example,
        # calling writerows( [ [ a , b ] , [ c , d ] ])
        # will result in the following .csv file:
        # a,b
        # c,d

        # TODO: construct a list of lists in the
        # following form:
        # [ [ token1, normalised_form1 ] ,
        #   [ token2, normalised_form2 ] ,
        #  ... ]

        # using the helper.tokenize() and
        # helper.normalize() functions, then change
        # the below call to use your list of lists
        tokens = helper.tokenize(input_string)

        tokens_and_normalizations = [[token, helper.normalize(token)]
                                     for token in tokens]
        writer.writerows(tokens_and_normalizations)
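A usage sketch for write(). helper.tokenize and helper.normalize are not shown in these examples; the output below assumes whitespace tokenization and lower-casing, purely for illustration.

# Hypothetical call and the TSV it would produce under those assumptions
# (each line is token<TAB>normalized_form):
write("The CAT sat", "tokens.tsv")
# The   the
# CAT   cat
# sat   sat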
Example #13
def collect():
    global chunks
    i = 0
    with open("data/posts.csv", "r") as f:
        for post in csv.DictReader(f, fieldnames=["gender", "author", "body"]):
            if i % 100 == 0:
                print(i)
            i += 1
            # tokenize first and skip empty posts so the feature code never divides by zero
            if len(helper.tokenize(post["body"])) == 0:
                continue
            x = features.get_features(post["body"])
            y = [post["gender"] == "male"]
            chunks.append(y + x)

    chunks = np.array(chunks)
    np.random.shuffle(chunks)
    np.save("data/chunks.npy", chunks)
Example #14
    def process_text(self,
                     text,
                     nummify=True,
                     add_words_to_list=True,
                     is_query=False):
        tokenized = helper.tokenize(text, self.text_processor_pipeline)

        if is_query:
            vec = []
            for word in tokenized:
                if word in self.word_to_num:
                    vec.append(self.word_to_num[word])
            return vec
        else:
            for word in tokenized:
                self.add_word(word)

            if not nummify:
                return

            return self.nummify(tokenized)
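add_word, nummify, and word_to_num are not shown in the source. A minimal sketch of what they are assumed to do (incremental integer ids per word, then mapping a token list onto those ids):

word_to_num = {}

def add_word(word):
    # Assumed: assign the next free integer id to unseen words.
    if word not in word_to_num:
        word_to_num[word] = len(word_to_num)

def nummify(tokens):
    # Assumed: map known tokens to their ids, dropping unknown ones.
    return [word_to_num[w] for w in tokens if w in word_to_num]

for w in ["deep", "learning", "deep"]:
    add_word(w)
print(nummify(["deep", "nets"]))  # [0]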
Example #15
def transform(entry, output):
    chunks = []
    try:
        i = 0
        with open(entry, "r") as f:
            for post in csv.DictReader(f,
                                       fieldnames=["gender", "author",
                                                   "body"]):
                if i % 100 == 0:
                    print(i)
                i += 1
                # tokenize first and skip empty posts so the feature code never divides by zero
                if len(helper.tokenize(post["body"])) == 0:
                    continue
                x = features.get_features(post["body"])
                y = [post["gender"] == "male"]
                chunks.append(y + x)

        chunks = np.array(chunks)
        np.random.shuffle(chunks)
        np.save(output, chunks)
    except KeyboardInterrupt:
        timestamp = datetime.now().timestamp()
        np.save(f"{output}_{timestamp}", chunks)
Example #16
 def __init__(self, content, max_len, tokenize=False):
     content_terms = helper.tokenize(content, tokenize)
     self.text = content_terms if len(content_terms) <= max_len else content_terms[:max_len]
     self.is_clicked = False
Example #17
    def question(self):
        """
        Parses the user's most recent message,
        and decides what to do based on the
        content and the current status.
        """
        name = self.name
        mess = self.memory.read("message")
        m = tokenize(mess)
        mess = " ".join(m)

        if mess is None: return "reset"

        # if the user wants to exit, then
        # return True (kill the session)
        if mess.startswith("exit") or \
                "bye" in m or \
                "goodbye" in m:
            self.bot.send("Glad to be of help :)", name)
            return "exit"

        # if the user says "nevermind", then
        # clear the session
        if mess.find("nevermind") != -1:
            self.bot.send("Ok.", name)
            self.clear()
            return "reset"

        # if the user greets Dodona, then respond
        # in kind.
        if "hi" in m or \
                "hey" in m or \
                "hello" in m != -1:
            self.bot.send("Hello, " + name + "!")
            return None
        
        # check the status, and return the corresponding
        # function if necessary
        s = self.memory.read("status")
        #if s == "unknown":  return self.unknown(mess)
        if s == "learn":  return self._learn()
        if s:
            if s.startswith("pos"):  
                return self._part_of_speech(mess, s.split("_")[1])

        d = self.memory.read("data")
        k = self.memory.read("topic")
        # if there is no current topic, then decipher one
        # from the most recent message.
        if k is None:
            self._AI(mess)
            if self.memory.read("topic"):
                return None
            else:
                return "reset"

        # if there is a current topic, search for a subtopic
        else:
            self._AI(mess, d, k)
            if self.memory.read("status") == "pos_first":
                return None
            else:
                self.memory.pop("topic")
                self.memory.pop("data")
                return "reset"
Example #18
                 workers = cpu_count,
                 iter = n_iterations)

# training the word model

word_model.build_vocab(text_clean)
word_model.train(text_clean)

# creating mapping dictionaries
word_idx, idx_word, word_vec = helper.ind(vocab,word_model)

# creating embedding matrix
embed_weight = helper.embed_wt(vocab,word_idx,word_vec,vec_dim)

# sequence
sent_seq_parsed,sent_seq_parsed_pad = helper.tokenize(text_clean,word_idx,maxlen)

# training data
word_list = word_tokenize(text_clean)
n_symbols = len(vocab)+1
sentences_list = []

for sentence in sent_seq_parsed_pad:
    for idx in sentence:
        sentences_list.append(idx)
sentences_seq = []
next_word = []

for i in range(len(sentences_list)-maxlen):
    sentences_seq.append(sentences_list[i: i + maxlen])
    next_word.append(sentences_list[i + maxlen])
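A toy run of the sliding-window split above with maxlen = 3: each window of maxlen ids becomes one training sequence, and the id right after it becomes the prediction target.

sentences_list = [4, 7, 2, 9, 5]
maxlen = 3
sentences_seq, next_word = [], []
for i in range(len(sentences_list) - maxlen):
    sentences_seq.append(sentences_list[i: i + maxlen])
    next_word.append(sentences_list[i + maxlen])
print(sentences_seq)  # [[4, 7, 2], [7, 2, 9]]
print(next_word)      # [9, 5]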
Example #19
 def add_text(self, text, tokenize, max_length):
     content_terms = helper.tokenize(text, tokenize)
     if len(content_terms) > max_length:
         self.query_terms = ['<s>'] + content_terms[:max_length] + ['</s>']
     else:
         self.query_terms = ['<s>'] + content_terms + ['</s>']
Example #20
    def question(self):
        """
        Parses the user's most recent message,
        and decides what to do based on the
        content and the current status.
        """
        name = self.name
        mess = self.memory.read("message")
        m = tokenize(mess)
        mess = " ".join(m)

        if mess is None: return "reset"

        # if the user wants to exit, then
        # return True (kill the session)
        if mess.startswith("exit") or \
                "bye" in m or \
                "goodbye" in m:
            self.bot.send("Glad to be of help :)", name)
            return "exit"

        # if the user says "nevermind", then
        # clear the session
        if mess.find("nevermind") != -1:
            self.bot.send("Ok.", name)
            self.clear()
            return "reset"

        # if the user greets Dodona, then respond
        # in kind.
        if "hi" in m or \
                "hey" in m or \
                "hello" in m != -1:
            self.bot.send("Hello, " + name + "!")
            return None

        # check the status, and return the corresponding
        # function if necessary
        s = self.memory.read("status")
        #if s == "unknown":  return self.unknown(mess)
        if s == "learn": return self._learn()
        if s:
            if s.startswith("pos"):
                return self._part_of_speech(mess, s.split("_")[1])

        d = self.memory.read("data")
        k = self.memory.read("topic")
        # if there is no current topic, then decipher one
        # from the most recent message.
        if k is None:
            self._AI(mess)
            if self.memory.read("topic"):
                return None
            else:
                return "reset"

        # if there is a current topic, search for a subtopic
        else:
            self._AI(mess, d, k)
            if self.memory.read("status") == "pos_first":
                return None
            else:
                self.memory.pop("topic")
                self.memory.pop("data")
                return "reset"
Example #21
                   sep='\t',
                   names=['label', 'body_text'],
                   header=None)
data.columns = ['label', 'body_text']

#START DATA PREPROCESSING

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(' '))

data['punct%'] = data['body_text'].apply(lambda x: count_punctuation(x))

data['body_text_clean'] = data['body_text'].apply(
    lambda x: remove_punctuation(x))

data['body_text_tokenized'] = data['body_text_clean'].apply(
    lambda x: tokenize(x))

stopwords = nltk.corpus.stopwords.words('english')

data['body_text_nonstop'] = data['body_text_tokenized'].apply(
    lambda x: remove_stopwords(x, stopwords))

stemmer = nltk.PorterStemmer()

data['body_text_stemmed'] = data['body_text_nonstop'].apply(
    lambda x: stemming(x, stemmer))

wn = nltk.WordNetLemmatizer()

data['body_text_lemmatized'] = data['body_text_nonstop'].apply(
    lambda x: lemmatizing(x, wn))
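A quick check of the body_len feature defined above (character count excluding spaces); count_punctuation and the other helpers are project-specific and not shown here.

text = "Free entry in 2 a wkly comp"
body_len = len(text) - text.count(' ')
print(body_len)  # 21 (27 characters, 6 of them spaces)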
Example #22
 def __init__(self, text, max_len, tokenize=False):
     content_terms = helper.tokenize(text, tokenize)
     self.text = content_terms if len(content_terms) <= max_len else content_terms[:max_len]
     self.rel_docs = []
Example #23
epoch = args.epoch
print_nsteps = args.print_nsteps
verbose = args.verbose
ckpt_prefix = args.ckpt_prefix

batch_size = hparams['batch_size']

if not os.path.isdir(ckpt_prefix):
    os.mkdir(ckpt_prefix)

ckpt_path = os.path.join(ckpt_prefix, 'model.ckpt')

tgt_sentences, tgt_metadata = data_pipeline(args.tgt_dataset, padding=True)
src_sentences, src_metadata = data_pipeline(args.src_dataset, padding=True)

src_inputs = np.array([tokenize(sentence, src_metadata, source=True, reverse=True)\
                          for sentence in src_sentences])
tgt_outputs = np.array([tokenize(sentence, tgt_metadata, source=False, reverse=False)\
                          for sentence in tgt_sentences])

save_metadata(tgt_metadata, "tgt_metadata.dill")
save_metadata(src_metadata, "src_metadata.dill")

hparams['tgt_vocab_size'] = tgt_metadata.vocab_size
hparams['src_vocab_size'] = src_metadata.vocab_size
hparams['dec_max_time_step'] = tgt_metadata.max_time_step
save_hparams(json_path, hparams)

train_graph = tf.Graph()
eval_graph = tf.Graph()
Example #24
 def add_text(self, text, tokenize):
     content_terms = helper.tokenize(text, tokenize)
     for i in range(len(content_terms)):
         term = '#' + content_terms[i] + '#'
         for j in range(0, len(term) - 2):
             self.query_terms.append(term[j:j + 3])
Example #25
parser.add_argument("-ckpt_prefix",
                    "--ckpt_prefix",
                    nargs="?",
                    help="checkpoint path prefix",
                    type=str)
parser.add_argument('-hparams',
                    "--hparams",
                    help="path to hyperparameters file (.json)",
                    type=str)

args = parser.parse_args()

src_metadata = load_metadata('src_metadata.dill')
tgt_metadata = load_metadata('tgt_metadata.dill')

src_inputs = np.array([tokenize(sentence, src_metadata, source=True, reverse=True)\
                          for sentence in src_sentences])

fp = open(args.hparams, 'r')
hparams = json.load(fp)
ckpt_path = os.path.join(args.ckpt_prefix, 'model.ckpt')

infer_graph = tf.Graph()

with infer_graph.as_default():
    infer_model = Model("infer", hparams)

with tf.Session(graph=infer_graph) as sess:
    sess.run(tf.global_variables_initializer())
    infer_model.saver.restore(sess, ckpt_path)
    while True:
Example #26
# load gram_counts
gram_count = {}
with open(args["gramcnt"], "r") as F:
    for line in F:
        gram_string, count = line.split(",")
        count = int(count)
        if gram_string in gram_label: gram_count[gram_string] = count
print "Finish Loading Gram Counts."

# dump word_embeddings
cPickle.dump(embedding, open(join(args["output"], "sentiment_custom_We"),
                             'wb'))

for file_name in ['train-rootfine', 'dev-rootfine', 'test-rootfine']:
    output = []
    with open(join(args["input"], file_name), "r") as F:
        for line in F:
            vector = line.split(" ")
            label = vector[0]
            sentence = sanitize_line(
                filter_with_alphabet(" ".join(vector[1:]), alphabet))
            for i in range(repeat):
                tokenized_sentence = tokenize(gram_count, sentence,
                                              gram_length, token_weight)
                for j in range(len(tokenized_sentence)):
                    tokenized_sentence[j] = gram_label.get(
                        tokenized_sentence[j], 0)  # 0 is the default label for unseen grams, not meaningful
                output.append([tokenized_sentence, label])
    cPickle.dump(output, open(join(args["output"], file_name), 'wb'))
    print "Finish Dumping File " + file_name