Example #1
 def _get_emb(self, w2v):
     """ Get the average w2v embedding """
     toks = tokenizer(self.opn.lower()) + tokenizer(self.asp.lower())
     self.is_valid = len(toks) > 0
     if not self.is_valid:
         return None
     embs = torch.stack([self._get_tok_emb(tok, w2v) for tok in toks],
                        dim=0)
     return torch.mean(embs, dim=0)
Example #2
def PairLoader(pairs, batch_size, w2cloader, max_seq_length=3):
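	# Convert each left/right row to token-id sequences (up to max_seq_length per field) and batch them as tensors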
	left = [pair[0] for pair in pairs]
	right = [pair[1] for pair in pairs]
	left_ids = [[w2cloader.toks2ids(tokenizer(col), max_seq_length) for col in row] for row in left]
	right_ids = [[w2cloader.toks2ids(tokenizer(col), max_seq_length) for col in row] for row in right]
	dataset = torch.utils.data.TensorDataset(torch.tensor(left_ids), torch.tensor(right_ids))
	data_iter = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

	return data_iter
Example #3
def semanticSimilarity(sentenceA, sentenceB, infoContentNorm):
    wordsA = tokenizer(sentenceA)
    wordsB = tokenizer(sentenceB)
    wordSet = set(wordsA).union(set(wordsB))
    wordVectorA = semanticVector(wordsA, wordSet, infoContentNorm)
    wordVectorB = semanticVector(wordsB, wordSet, infoContentNorm)

    # cosine similarity between the two semantic vectors
    semSimilarity = dot(wordVectorA, wordVectorB) / (linalg.norm(wordVectorA) *
                                                     linalg.norm(wordVectorB))
    return semSimilarity
Example #4
def wordSimilarity(sentenceA, sentenceB):
    wordsA = tokenizer(sentenceA)
    wordsB = tokenizer(sentenceB)
    wordSet = list(set(wordsA).union(set(wordsB)))
    index = {tok: i for i, tok in enumerate(wordSet)}
    # word-order similarity: 1 - ||r1 - r2|| / ||r1 + r2||
    r1 = wordOrder(wordsA, wordSet, index)
    r2 = wordOrder(wordsB, wordSet, index)
    srTemp = linalg.norm(r1 - r2) / linalg.norm(r1 + r2)
    return 1 - srTemp
Example #5
def ppindexer(test, gold, PREPS):
    ppindex = -100
    correctionpair = None  # avoid an UnboundLocalError when no differing preposition pair is found
    test_words = tokenizer(test)
    gold_words = tokenizer(gold)
    assert len(test_words) == len(gold_words)
    for i, pair in enumerate(zip(test_words, gold_words)):
        # record the (last) position where the two sentences differ by a preposition
        if pair[0] != pair[1] and pair[0] in PREPS and pair[1] in PREPS:
            correctionpair = (pair[0], pair[1])
            ppindex = i
    return ppindex, test_words, gold_words, correctionpair
Example #6
    def __call__(self, x):
        bow = []
        for sent in splitter(x):
            for i, token in enumerate(tokenizer(sent)):

                if self.remove_nonalpha and not token.isalpha():
                    continue

                if (self.remove_entities and i
                        and token[0] != token[0].lower()):
                    continue

                if (self.remove_stopwords and token.lower() in STOPWORDS):
                    continue

                bow.append(token if not self.lowercase else token.lower())

        _bow = []
        prev = None
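        # collapse runs of consecutive duplicate tokens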
        while bow:
            token = bow.pop(0)
            if token == prev:
                continue
            _bow.append(token)
            prev = token

        return ' '.join(_bow)
Example #7
    def __call__(self, doc):

        tokens = [
            token for token in tokenizer(doc, self.lang, True)
            if token.isalnum() and len(token) > 0 and not token.isspace()
        ]  # we can eliminate punctuation  as well
        tokens = [token.lower() for token in tokens]

        if self.remove_numbers:
            number_pattern = r"[a-zA-Z]{,3}\d{6,}"
            tokens = [re.sub(number_pattern, "", token) for token in tokens]

        if self.eliminate_stopwords:
            stopwords = stopword_lists.get_stopwords(lang="tr")
            tokens = [token for token in tokens if token not in stopwords]

        if self.apply_stemming:
            tokens = [tr_stemmer.stem2(token) for token in tokens]

        if self.deasciify:
            tokens = [
                Deasciifier(token).convert_to_turkish() for token in tokens
            ]

        tokens = [token.strip() for token in tokens]
        tokens = [token for token in tokens
                  if len(token) > 0]  # or not token.isspace()]
        return tokens
Example #8
    def extract(self, name, name_list, text, limit=1, is_wordnet=False):
        if not self.is_name_in_text(name_list, text):
            return {}, 0
        tagList = self.tag_list
        stopwords = self.stops
        wordDict = {}
        filterd_dict = {}
        sents = segmenter(text)
        wordcount = 0
        for sent in sents:
            tokens = tokenizer(sent.lower())
            terms = tagger(tokens)
            for t in terms:
                wordcount += 1
                key = '.'.join(t)
                try:
                    wordDict[key] += 1
                except KeyError:
                    wordDict[key] = 1

        for term_s, count in wordDict.items():
            try:
                word, pos = term_s.split('.')
            except ValueError:
                continue
            if pos[:2] in tagList and word.lower() not in stopwords and len(word) >= 3:
                print(word, pos)
                if is_wordnet:
                    meanList = self.abstract(word, pos, limit)
                    for w in meanList:
                        filterd_dict[term_s] = count
                else:
                    filterd_dict[term_s] = count

        return filterd_dict, wordcount
Example #9
 def parse_comments(comments):
     lines_final = []
     syn = ['Syntax', 'SYNTAX']
     others = ['NOTE', 'Note', 'Comments']
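     # Lines whose first token is a Syntax/NOTE/Comments keyword start a new entry;
     # other lines are treated as continuations and appended to the previous entry.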
     if comments is not None:
         comm_list = comments.text
         if comm_list:
             split_list = comm_list.split('\n')
             leading_space_stripped = [
                 my_str.strip() for my_str in split_list
             ]
             for line in leading_space_stripped:
                 if line:
                     is_syn_struct = False
                     is_split = False
                     tokens = tokenizer(line)
                     if tokens[0] in syn:
                         is_syn_struct = True
                         is_split = True
                     elif tokens[0] in others:
                         is_syn_struct = False
                         is_split = True
                     elif is_syn_struct:
                         is_split = True
                     if is_split: lines_final.append(line)
                     elif not lines_final: lines_final.append(line)
                     else: lines_final[-1] += ' ' + line
             return lines_final
         return ''
     return ''
Example #10
	def get_emb_vec(self, row):
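		# Look up an embedding for every token in the row; fall back to the unknown-token embedding if none are found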
		emb = []
		for field in row:
			for tok in tokenizer(field):
				emb.append(self.tok2emb(tok))
		if len(emb) == 0:
			emb = [self.w_emb[self.unknown_idx]]
		return torch.norm(torch.tensor(emb), dim=0)
Example #11
 def snippet_tokenize(self, snippet):
     snippet = snippet.lower()
     snippet_tokens = [
         self.clean_word(t) for t in tokenizer(snippet)
         if t not in self.stops and len(t) >= 3 and t != '...'
     ]
     snippet_tokens = [e for e in snippet_tokens if e]
     snippet_freq = dict(FreqDist(snippet_tokens))
     return snippet_freq
Example #12
 def title_tokenize(self, title):
     title = title.lower()
     title_tokens = [
         self.clean_word(t) for t in tokenizer(title)
         if t not in self.stops and len(t) >= 3
     ]
     title_tokens = [e for e in title_tokens if e]
     title_freq = dict(FreqDist(title_tokens))
     return title_freq
Example #13
def ar():
    # sys.argv[2] selects which data set should be run.
    # Create a list of the files in the chosen directory.
    directory = sys.argv[2]
    files = os.listdir("../out/" + directory + "/ar/")

    # Create counter, set to zero.
    counter = 0

    # Open file with Arabic stop words.
    sw_in = open("../data/arstoplist.txt")
    sw = sw_in.read().splitlines()
    sw_in.close()

    # Open file with Arabic punctuation.
    punctlist = open("../data/arabpunct.txt").read().splitlines()

    # DELETE THIS Import IsriStemmer()
    # st = ISRIStemmer()

    # Loop over files:
    for f in files:
        counter += 1
        print("Beginning file: " + str(counter))
        allwords = []

        if "txt" in f:
            # Open and read in file.
            f_in = open("../out/" + directory + "/ar/" + f, 'r')
            lines = f_in.readlines()
            f_in.close()

            # Loop over the lines in the file.
            for line in lines:
                words = []

                # Tokenize the line.
                tokens = tokenizer(line)

                # Loop over the words in the line:
                # Pass if the word appears in the stopwords or punctuation list.
                # Stem all other tokens and append to: words.
                for t in tokens:
                    if t in sw:
                        pass
                    elif t in punctlist:
                        pass
                    else:
                        words.append(t)  # Why stem here???!!!

                allwords.append(words)

            # Write stemmed words to file.
            f_out = open("../out/tokenized/" + directory + "/ar/" + f, 'w')
            for item in allwords:
                f_out.write("\n".join(item))
            f_out.close()
Example #14
 def is_name_in_text(self, name_list, text):
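     # True if any entry of name_list occurs as a token anywhere in the lower-cased text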
     lower_text = text.lower()
     sents = segmenter(lower_text)
     word_set = set()
     for sent in sents:
         tokens = tokenizer(sent)
         for t in tokens:
             word_set.add(t)
     for e in name_list:
         if e in word_set:
             return True
     return False
Example #15
 def is_name_in_text(self, name_list, text):
     lower_text = text.lower()
     sents = segmenter(lower_text)
     word_set = set()
     for sent in sents:
         tokens = tokenizer(sent)
         for t in tokens:
             word_set.add(t)
     for e in name_list:
         if e in word_set:
             return True
     return False
Example #16
def en():
    # sys.argv[2] selects which data set should be run.
    # Create a list of the files in the chosen directory.
    directory = sys.argv[2]
    files = os.listdir("../out/" + directory + "/en/")

    # Create counter, set to zero.
    counter = 0

    # Import stop words.
    sw_en = stopwords.words('english')

    # DELETE THIS Import Porterstemmer()
    # st = PorterStemmer()

    # Loop over files:
    for f in files:
        counter += 1
        print("Beginning file: " + str(counter))
        allwords = []

        if "txt" in f:
            # Open and read in file.
            f_in = open("../out/" + directory + "/en/" + f, 'r')
            lines = f_in.readlines()
            f_in.close()

            # Loop over the lines in the opened file:
            # lowercase and tokenize words.
            for line in lines:
                words = []
                line = line.lower()
                tokens = tokenizer(line)

                # Loop over the words in tokens:
                # pass if the word is in the list of stop words.
                # all other words: stem the word (???) and append to list: words.
                for t in tokens:
                    try:
                        if t in sw_en:
                            pass
                        else:
                            words.append(t)
                    except IndexError:
                        words.append(t)

                allwords.append(words)

            # Write the stemmed words to file.
            f_out = open("../out/tokenized/" + directory + "/en/" + f, 'w')
            for item in allwords:
                f_out.write("\n".join(item))
            f_out.close()
Example #17
def DataLoader(target_rows, aux_rows, batch_size, w2cloader, max_seq_length=5):
	assert len(target_rows) == len(aux_rows)
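	# For each auxiliary column: its distinct values, their counts, and per-value weights capped at 3 relative to the rarest value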
	aux_cols = [list(set([r[i] for r in aux_rows])) for i in range(len(aux_rows[0]))]
	aux_c_sizes = [len(c) for c in aux_cols]
	aux_c_counts = [[sum(1 for row in aux_rows if row[i]==val) for val in aux_cols[i]] for i in range(len(aux_cols))]
	aux_weights = [[min(3, val/(min(col)+1.)) for val in col] for col in aux_c_counts]

	# Update batch
	target_ids = [[w2cloader.toks2ids(tokenizer(col), max_seq_length) for col in row] for row in target_rows]
	label_ids = [[aux_cols[i].index(row[i]) for i in range(len(row))] for row in aux_rows]	
	dataset = torch.utils.data.TensorDataset(torch.tensor(target_ids), torch.tensor(label_ids))
	data_iter = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

	return data_iter, aux_c_sizes, aux_weights
Example #18
def word_tokenize(text_block, stemmer, stop_words):
    sentences = SENT_RE.findall(text_block)
    sense_phrases = []
    for sentence in sentences:
        sentence = sentence.replace('\'', '').replace('(', ' ') \
            .replace(')', ' ').replace("/", " or ").replace("-", "")

        sentence = TAG_RE.sub('', sentence)
        sentence = "".join((c for c in sentence if 0 < ord(c) < 127))
        sentence_words = [stemmer.stem(word) for word in tokenizer(sentence) if word not in stop_words
                          and re.match(alpha_numeric, word)]
        sense_phrases.append(sentence_words)
        logger.info("Will sense tokenize : %s" % sentence)
    return sense_phrases
Example #19
def word_tokenize(text_block, stemmer, stop_words):
    sentences = SENT_RE.findall(text_block)
    sense_phrases = []
    for sentence in sentences:
        sentence = sentence.replace('\'', '').replace('(', ' ') \
            .replace(')', ' ').replace("/", " or ").replace("-", "")

        sentence = TAG_RE.sub('', sentence)
        sentence = "".join((c for c in sentence if 0 < ord(c) < 127))
        sentence_words = [
            stemmer.stem(word) for word in tokenizer(sentence)
            if word not in stop_words and re.match(alpha_numeric, word)
        ]
        sense_phrases.append(sentence_words)
        logger.info("Will sense tokenize : %s" % sentence)
    return sense_phrases
Example #20
def tag_sentences(sentences, pos_symbol=False):
    tokenized = []
    for sent in sentences:
        tokenized.append(tokenizer(sent))

    processed_list = tagger(tokenized)

    if not pos_symbol:
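        # replace each POS symbol with its full tag name via POS_TAGS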
        output_list = []
        for sentence in processed_list:
            new_sentence = []
            for word in sentence:
                new_sentence.append((word[_IDX_WORD], POS_TAGS[word[_IDX_SYMBOL]]))
            output_list.append(new_sentence)
    else:
        output_list = processed_list

    return output_list
Example #21
 def make_cluster_name(self):
     if self.cluster_name:
         return self.cluster_name
     all_titles = [[w.lower() for w in tokenizer(self.titles[docid])]
                   for docid in self.items()]
     gram_dist = Counter()
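     # Score 1- to 5-grams from a sample of up to 50 titles by summed tf-idf weight (scaled by n),
     # skipping grams that start or end with a stopword or contain non-alphabetic tokens.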
     for title_tokens in random.sample(
             all_titles, min(50, len(all_titles))):
         for n in [1, 2, 3, 4, 5]:
             for gram in ngrams(title_tokens, n):
                 tfidf_weight = sum([
                     self.__get_tfidf_weight(token.lower())
                     for token in gram
                 ])
                 if (gram[0].lower() in STOPWORDS
                         or gram[-1].lower() in STOPWORDS or
                     [token for token in gram if not token.isalpha()]):
                     continue
                 gram_dist[gram] += tfidf_weight * n
     if not gram_dist.most_common():
         return None
     self.cluster_name = ' '.join(gram_dist.most_common()[0][0])
     return self.cluster_name
Example #22
 def _read_reviews(self, source_file):
     """ Read reviews from file and conduct initial pruning
     """
     entities = set([])
     reviews = []
     num_exts = 0
     with open(source_file, "r", encoding="utf-8") as file:
         for _, line in enumerate(tqdm(file, desc="reviews")):
             review = json.loads(str(line))
             # Process sentences & extractions
             sents = review["sentences"]
             exts = review["extractions"]
             # Filter sentences with NO extractions
             if self.filter_empty:
                 sents = [sents[i] for i in set([e["sid"] for e in exts])]
             # Prune by number of sentences
             if len(sents) < self.s_min or len(sents) > self.s_max:
                 continue
             # Prune by number of extractions
             if len(exts) < self.e_min or len(exts) > self.e_max:
                 continue
             # Process extractions & sentences
             for ext in review["extractions"]:
                 ext["opinion"] = self._process_span(ext["opinion"])
                 ext["aspect"] = self._process_span(ext["aspect"])
             sents = [self.detokenizer.detokenize(toks) for toks in sents]
             # Validate number of tokens per review
             num_tokens = len(tokenizer(" ".join(sents)))
             if num_tokens > self.t_max:
                 continue
             review["sentences"] = sents
             reviews.append(review)
             entities.add(review["ty_id"])
             num_exts += len(exts)
     print("Average number of extractions per review: {}".format(
         num_exts / (0.0 + len(reviews))))
     return reviews, entities
Example #23
    def extract(self, name, name_list, text, limit=1, is_wordnet=False):
        if not self.is_name_in_text(name_list, text):
            return {}, 0
        tagList = self.tag_list
        stopwords = self.stops
        wordDict = {}
        filterd_dict = {}
        sents = segmenter(text)
        wordcount = 0
        for sent in sents:
            tokens = tokenizer(sent.lower())
            terms = tagger(tokens)
            for t in terms:
                wordcount += 1
                key = '.'.join(t)
                try:
                    wordDict[key] += 1
                except KeyError:
                    wordDict[key] = 1

        for term_s, count in wordDict.items():
            try:
                word, pos = term_s.split('.')
            except ValueError:
                continue
            if pos[:2] in tagList and word.lower() not in stopwords and len(word) >= 3:
                print(word, pos)
                if is_wordnet:
                    meanList = self.abstract(word, pos, limit)
                    for w in meanList:
                        filterd_dict[term_s] = count
                else:
                    filterd_dict[term_s] = count

        return filterd_dict, wordcount
Example #24
def main(root_path):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--json', dest='json',
                        default='kvret_train_public.json',
                        help='process json file')
    args = parser.parse_args()
    task = args.json.split('_')[1]

    with open(os.path.join(root_path, args.json)) as f:
        dialogues = json.load(f)

    with open(os.path.join(root_path, 'kvret_entities.json')) as f:
        entities_dict = json.load(f)

    # drop poi and poi_type here.
    global_kb_type = ['distance', 'traffic_info', 'location', 'weather_attribute', 'temperature', "weekly_time",
                      'event', 'time', 'date', 'party', 'room', 'agenda']
    global_temp = []
    di = {}
    # join multi-word entity strings with '_' and map each original form to its '_'-joined form
    for e in global_kb_type:
        for p in map(lambda x: str(x).lower(), entities_dict[e]):
            if "_" in p and p.replace("_", " ") != p:
                di[p.replace("_", " ")] = p
            else:
                if p != p.replace(" ", "_"):
                    di[p] = p.replace(" ", "_")
    global_temp.append(di)

    example_kbs = []

    for d in dialogues:
        roots = []

        if (d['scenario']['task']['intent'] == "navigate"):  # "schedule" "navigate"
            print("#navigate#")
            temp = []
            names = {}
            # iterate through all kb infos.
            for el in d['scenario']['kb']['items']:
                poi = " ".join(tokenizer(el['poi'].replace("'", " "))).replace(" ", "_").lower()
                slots = ['poi', 'distance', 'traffic_info', 'poi_type', 'address']
                # remove "'" and convert to lowercase
                for slot in slots:
                    el[slot] = " ".join(tokenizer(el[slot].replace("'", " "))).lower()
                names[el['poi']] = poi
                di = {
                    el['distance']: el['distance'].replace(" ", "_"),
                    el['traffic_info']: el['traffic_info'].replace(" ", "_"),
                    el['poi_type']: el['poi_type'].replace(" ", "_"),
                    el['address']: el['address'].replace(" ", "_"),
                }
                print(
                    "0 " + di[el['distance']] + " " + di[el['traffic_info']] + " " + di[el['poi_type']] + " poi " + poi)
                print("0 " + poi + " distance " + di[el['distance']])
                print("0 " + poi + " traffic_info " + di[el['traffic_info']])
                print("0 " + poi + " poi_type " + di[el['poi_type']])
                print("0 " + poi + " address " + di[el['address']])
                temp.append(di)

                # construct tree root for each kb item
                root = Node(poi, 'poi', layer=0)
                # skip poi (already used as the root)
                for slot in slots[1:]:
                    root.children.append(Node(di[el[slot]], slot, layer=1))
                roots.append(root)

            # used for later entity matching?
            temp += global_temp

            # drop last one.
            if (len(d['dialogue']) % 2 != 0):
                d['dialogue'].pop()

            j = 1
            for i in range(0, len(d['dialogue']), 2):
                user = " ".join(cleaner(tokenizer(str(d['dialogue'][i]['data']['utterance']).lower())))
                bot = " ".join(cleaner(tokenizer(str(d['dialogue'][i + 1]['data']['utterance']).lower())))
                # replace entity names with names joined by "_"
                bot, user = entity_replace(temp, bot, user, names)
                navigation = global_kb_type  # ['distance','traffic_info']
                nav_poi = ['address', 'poi', 'type']
                gold_entity = []
                for key in bot.split(' '):
                    for e in navigation:
                        for p in map(lambda x: str(x).lower(), entities_dict[e]):
                            if (key == p):
                                gold_entity.append(key)
                            elif (key == str(p).replace(" ", "_")):
                                gold_entity.append(key)

                    for e in entities_dict['poi']:
                        for p in nav_poi:
                            if (key == str(e[p]).lower()):
                                gold_entity.append(key)
                            elif (key == str(e[p]).lower().replace(" ", "_")):
                                gold_entity.append(key)
                # gold entity for each turn of dialogue.
                gold_entity = list(set(gold_entity))
                if bot != "" and user != "":
                    print(str(j) + " " + user + '\t' + bot + '\t' + str(gold_entity))
                    j += 1
            print("")

        elif (d['scenario']['task']['intent'] == "weather"):  # "weather"
            print("#weather#")
            temp = []
            j = 1
            print("0 today " + d['scenario']['kb']['items'][0]["today"])
            today = d['scenario']['kb']['items'][0]["today"]
            for el in d['scenario']['kb']['items']:

                for el_key in el.keys():
                    el[el_key] = " ".join(tokenizer(el[el_key])).lower()
                loc = el['location'].replace(" ", "_")
                di = {el['location']: loc}
                temp.append(di)
                days = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
                for day in days:
                    print("0 " + loc + " " + day + " " + el[day].split(',')[0].rstrip().replace(" ", "_"))
                    print("0 " + loc + " " + day + " " + el[day].split(',')[1].split(" ")[1] + " " +
                          el[day].split(',')[1].split(" ")[3])
                    print("0 " + loc + " " + day + " " + el[day].split(',')[2].split(" ")[1] + " " +
                          el[day].split(',')[2].split(" ")[3])

                # construct tree root for each kb item
                # root = Node(loc, 'location', layer=0)
                slots = ['weather', 'high', 'low']
                for day in days:
                    root = Node(loc, 'location', layer=0)
                    '''
                    tmp = Node(el[day], day, layer=1)
                    val = el[day]
                    splits = [item.strip() for item in val.split(',')]
                    tmp.children.append(Node(splits[0], 'weather', layer=2))
                    tmp.children.append(Node(splits[1], splits[1].split()[0], layer=2))
                    tmp.children.append(Node(splits[2], splits[2].split()[0], layer=2))
                    root.children.append(tmp)
                    '''
                    # change weather to 1-layer tree.
                    val = el[day]
                    splits = [item.strip() for item in val.split(',')]
                    root.children.append(Node(day, 'date', layer=1))
                    # more delicate for vals
                    root.children.append(Node(splits[1], splits[1].split()[0], layer=1))
                    root.children.append(Node(splits[2], splits[2].split()[0], layer=1))
                    # this flag is missing in the original dataset, so add it here
                    if today == day:
                        root.children.append(Node('yes', 'today', layer=1))
                    else:
                        root.children.append(Node('no', 'today', layer=1))

                    roots.append(root)

            temp += global_temp

            if (len(d['dialogue']) % 2 != 0):
                d['dialogue'].pop()

            for i in range(0, len(d['dialogue']), 2):
                user = " ".join(cleaner(tokenizer(str(d['dialogue'][i]['data']['utterance']).lower())))
                bot = " ".join(cleaner(tokenizer(str(d['dialogue'][i + 1]['data']['utterance']).lower())))
                bot, user = entity_replace(temp, bot, user)
                weather = global_kb_type  # ['location', 'weather_attribute','temperature',"weekly_time"]
                gold_entity = []
                for key in bot.split(' '):
                    for e in weather:
                        for p in map(lambda x: str(x).lower(), entities_dict[e]):
                            if (key == p):
                                gold_entity.append(key)
                            elif (key == str(p).replace(" ", "_")):
                                gold_entity.append(key)
                gold_entity = list(set(gold_entity))
                if bot != "" and user != "":
                    print(str(j) + " " + user + '\t' + bot + '\t' + str(gold_entity))
                    j += 1

            print("")

        if (d['scenario']['task']['intent'] == "schedule"):  # "schedule"
            print("#schedule#")
            temp = []
            names = {}
            j = 1
            # for all kb triple
            if (d['scenario']['kb']['items'] != None):
                for el in d['scenario']['kb']['items']:
                    for el_key in el.keys():
                        el[el_key] = " ".join(tokenizer(el[el_key])).lower()
                    ev = el['event'].replace(" ", "_")
                    names[el['event']] = ev
                    slots = ['time', 'date', 'party', 'room', 'agenda']
                    di = {}
                    for slot in slots:
                        if el[slot] == "-":
                            continue
                        if slot == "time":
                            print("0 " + ev + " " + slot + " " + el[slot].replace(" ", ""))
                            di[el[slot]] = el[slot].replace(" ", "")
                        else:
                            print("0 " + ev + " " + slot + " " + el[slot].replace(" ", "_"))
                            di[el[slot]] = el[slot].replace(" ", "_")
                    temp.append(di)

                    root = Node(ev, 'event', layer=0)
                    for slot in slots:
                        tmp = Node(el[slot], slot, layer=1)
                        root.children.append(tmp)

                    roots.append(root)

            temp += global_temp

            if (len(d['dialogue']) % 2 != 0):
                d['dialogue'].pop()

            for i in range(0, len(d['dialogue']), 2):
                user = " ".join(cleaner(tokenizer(str(d['dialogue'][i]['data']['utterance']).lower())))
                bot = " ".join(cleaner(tokenizer(str(d['dialogue'][i + 1]['data']['utterance']).lower())))
                bot, user = entity_replace(temp, bot, user, names)
                calendar = global_kb_type  # ['event','time', 'date', 'party', 'room', 'agenda']
                gold_entity = []
                for key in bot.split(' '):
                    for e in calendar:
                        for p in map(lambda x: str(x).lower(), entities_dict[e]):
                            if (key == p):
                                gold_entity.append(key)
                            elif (key == str(p).replace(" ", "_")):
                                gold_entity.append(key)
                gold_entity = list(set(gold_entity))
                if bot != "" and user != "":
                    print(str(j) + " " + user + '\t' + bot + '\t' + str(gold_entity))
                    j += 1

            print("")
        # add to example kbs.
        example_kbs.append(roots)

    # next step : save to file.
    with open(os.path.join(root_path, '{}_example_kbs.dat'.format(task)), 'wb') as f:
        pickle.dump(example_kbs, f)
Example #25
 def title_tokenize(self, title):
     title = title.lower()
     title_tokens = [self.clean_word(t) for t in tokenizer(title) if t not in self.stops and len(t) >= 3]
     title_tokens = [e for e in title_tokens if e]
     title_freq = dict(FreqDist(title_tokens))
     return title_freq
Example #26
def main():

    # Open all files related to removing stop words or punctuation from the data.
    sw_in = open(r"../data/arstoplist.txt")
    stopwords = sw_in.read().splitlines()
    punctlist = open("../data/arabpunct.txt").read().splitlines()

    directory = sys.argv[1]

    # Give location of input files.
    files = os.listdir("../out/" + directory + "/ar/")

    st = ISRIStemmer()
    #rx_en = re.compile(r'\D+')
    tokens = []
    counter = 0
    filelist = []

    for f in files:
        if "txt" in f:
            counter += 1
            f_in = open("../out/" + directory + "/ar/" + f, 'r')
            lines = f_in.readlines()
            f_in.close()
            filelist.extend(lines)

    print("Files read.")

    stemmed = {}
    types = {}

    f_out = open('../out/testset-tokenized-' + directory + '.txt', 'w')
    compl_list = []

    for line in filelist:
        #line = line.strip()
        #tokenize = word_tokenize(line)

        # Tokenize the text.
        tokenize = tokenizer(line)

        #tokenize.sort() # Comment this out after the test-set has been used?

        # Define all patterns that shall be excluded.
        rx_ar = re.compile(
            u'^[\u0621-\u064A]+$'
        )  # This excludes Arabic words that have numbers attached to them.
        rx_ar2 = re.compile(u'^(\u0622{2,})')

        for w in tokenize:
            if len(w) == 1:
                pass
            elif rx_ar2.match(w):
                pass
            elif rx_ar.match(w):
                f_out.write(w + "\n")
                compl_list.append(w)
            else:
                pass
    f_out.close()

    # re-insert

    for w in compl_list:
        types[w] = 0
        #if punctlist[0] in compl_list or punctlist[1] in compl_list or punctlist[2] or punctlist[3] in compl_list:
        #    if len(w) > 1: # ERROR
        #        new_w = w[:-1] # ERROR! This strips off Arabic letters although they are not in the punctlist
        #        types[new_w] = 0
        #        tokens.append(new_w)
        #    else:
        #        types[w] = 0
        #        tokens.append(w)

    print(str(len(types)) + " different words.")
    print("Punctuation separated.")

    # Here the actual stemming happens.
    verbs = {}
    c = -1
    for w in types:
        c += 1
        if w not in stopwords:
            stm = st.stem(w)
            stemmed[w] = stm
            verbs[stm] = 0
        if c % 10000 == 0:
            print(str(c) + " words stemmed.")
    print("File stemmed.")

    # print the stemmed words and their unstemmed versions to a file
    f_out = open('../out/stem_tok_' + directory + '.txt', 'w')
    wordlist = []
    for w in verbs.keys():
        if len(w) > 4:  # Don't save words that are longer than 4 letters. Verbs in Arabic are usually 3 letters long. In very rare cases they can be 2 or 4 letters long as well.
            pass
        else:
            wordlist.append(w)
            #f_out.write(w + "\t" + stemmed[w])
            #f_out.write(w + "\n")
    wordlist.sort()
    for w in wordlist:
        f_out.write(w + "\n")
    f_out.write("No. of verbs:" +
                str(len(wordlist)))  # Really verbs? Why not wordlist?
    f_out.close()

    # handle some corpora stats
    corp_stat = Counter(tokens)
    for w in list(corp_stat.keys())[0:11]:
        print("token: " + w + "\tno.: " + str(corp_stat[w]))
Example #27
 def __init_tools(self):
     test = "Just a test not for printing out or other use "
     tagger(tokenizer(test))
     segmenter(test)
     test = wn.synsets('test')
Example #28
 def snippet_tokenize(self, snippet):
     snippet = snippet.lower()
     snippet_tokens = [self.clean_word(t) for t in tokenizer(snippet) if t not in self.stops and len(t) >=3 and t != '...']
     snippet_tokens = [e for e in snippet_tokens if e]
     snippet_freq = dict(FreqDist(snippet_tokens))
     return snippet_freq
Example #29
for e in global_kb_type:
    for p in map(lambda x: str(x).lower(), entities_dict[e]):       
        if "_" in p and p.replace("_"," ")!=p:
            di[p.replace("_"," ")] = p 
        else:
            if p!=p.replace(" ","_"):
                di[p] = p.replace(" ","_")
global_temp.append(di)

for d in dialogues:
    if(d['scenario']['task']['intent']=="navigate"): #"schedule" "navigate"
        print("#navigate#")
        temp = []
        names = {}
        for el in d['scenario']['kb']['items']:
            poi = " ".join(tokenizer(el['poi'].replace("'"," "))).replace(" ", "_").lower()
            slots = ['poi','distance','traffic_info','poi_type','address']
            
            for slot in slots:
                el[slot] = " ".join(tokenizer(el[slot].replace("'"," "))).lower()
            
            names[el['poi']] = poi

            di = {
                el['distance']: el['distance'].replace(" ", "_"),
                el['traffic_info']: el['traffic_info'].replace(" ", "_"),
                el['poi_type']: el['poi_type'].replace(" ", "_"),
                el['address']: el['address'].replace(" ", "_"),
            }

            print("0 "+di[el['distance']]+" "+di[el['traffic_info']]+" "+di[el['poi_type']]+" poi "+poi)
Example #30
 def __init_tools(self):
     test = "Just a test not for printing out or other use "
     tagger(tokenizer(test))
     segmenter(test)
     test = wn.synsets('test')
Example #31
def extract_words(string, lowercase=True, rm_num=True):
    return [
        w.lower() if lowercase else w for w in tokenizer(string)
        if not rm_num or w.isalpha()
    ]
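# Hypothetical usage, assuming `tokenizer` is a standard word tokenizer such as nltk.word_tokenize:
#   extract_words("3 dogs barked 4 times")  ->  ['dogs', 'barked', 'times']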
Example #32
            for c in column_names:
                entity_set.append(str(kb[c]).lower())
                if c != "name":
                    print("0 " + str(kb['name']).lower() + " " + c + " " +
                          str(kb[c]).lower() + " name " + c)
        entity_set = list(set(entity_set))

        #dialog
        if (len(d['dialogue']) % 2 != 0):
            d['dialogue'].pop()

        j = 1
        for i in range(0, len(d['dialogue']), 2):
            user = " ".join(
                cleaner(
                    tokenizer(
                        str(d['dialogue'][i]['data']['utterance']).lower())))
            bot = " ".join(
                cleaner(
                    tokenizer(
                        str(d['dialogue'][i +
                                          1]['data']['utterance']).lower())))
            gold_entity = []
            for key in bot.split(' '):
                if key in entity_set:
                    gold_entity.append(key)
            gold_entity = list(set(gold_entity))
            if user != "" and bot != "":
                print(
                    str(j) + " " + user + '\t' + bot + '\t' + str(gold_entity))
                j += 1
        print("")
Example #33
def tokenize_string(s):
    return tokenizer().tokenize(s)
Example #34
import sys

from nltk import word_tokenize as tokenizer

with open(sys.argv[1]) as f_in, open(sys.argv[2], "w") as f_out:
    for c, l in enumerate(f_in):
        f_out.write(" ".join(tokenizer(l))+"\n")
        #if c % 1000 == 0:
        #    print(c)
Example #35
        for p in map(lambda x: str(x).lower(), entities_dict[e]):
            if "_" in p and p.replace("_", " ") != p:
                di[p.replace("_", " ")] = p
            else:
                if p != p.replace(" ", "_"):
                    di[p] = p.replace(" ", "_")
    global_temp.append(di)

    for d in dialogues:
        if (d['scenario']['task']['intent'] == "navigate"
            ):  #"schedule" "navigate"
            print("#navigate#")
            temp = []
            names = {}
            for el in d['scenario']['kb']['items']:
                poi = " ".join(tokenizer(el['poi'].replace("'", " "))).replace(
                    " ", "_").lower()
                slots = [
                    'poi', 'distance', 'traffic_info', 'poi_type', 'address'
                ]
                for slot in slots:
                    el[slot] = " ".join(tokenizer(el[slot].replace(
                        "'", " "))).lower()
                names[el['poi']] = poi
                di = {
                    el['distance']: el['distance'].replace(" ", "_"),
                    el['traffic_info']: el['traffic_info'].replace(" ", "_"),
                    el['poi_type']: el['poi_type'].replace(" ", "_"),
                    el['address']: el['address'].replace(" ", "_"),
                }
                print("0 " + di[el['distance']] + " " +
Example #36
def main():

    # Define which corpora to work with via sys.argv[1]
    corpora = sys.argv[1]

    # Define input data.
    k50 = "../out/mallet/testdez/" + corpora + "-50.txt"
    k100 = "../out/mallet/testdez/" + corpora + "-100.txt"
    k200 = "../out/mallet/testdez/" + corpora + "-200.txt"

    # Load ISRIStemmer.
    st = ISRIStemmer()

    # Create lists: all_plots, all_means.
    all_plots = []
    all_means = []

    # Create for loop over the three files.
    for i in (k50, k100, k200):
        # Open file, read it into variable f, close file.
        f_in = open(i)
        f = f_in.readlines()
        f_in.close()

        # Create lists: words, stemlist.
        words = []
        stemlist = []

        # Loop over the lines in f. Tokenize words, delete the numbers at the
        # beginning of each line (0:4). Append line to words.
        for line in f:
            line = tokenizer(line)
            del line[0:4]
            words.append(line)

        # Loop over words. Stem each word and append to stemlist.
        for listitem in words:
            stems = []
            for w in listitem:
                r = st.stem(w)
                stems.append(r)
            stemlist.append(stems)

        # Create lists: score, plotdata.
        score = []
        plotdata = []

        # Loop over lists in stemlist. Create a dictionary: d.
        # Loop over the words in topic:
        # if word is in d: add 1 to its value in d.
        # else: add word to d.
        for topic in stemlist:
            d = {}
            for item in topic:
                if item in d:
                    d[item] += 1
                else:
                    d[item] = 1

            # Get the value of each word in d and append it to plotdata.
            maximum = max(d, key=d.get)
            plotdata.append(d[maximum])

            # Calculate the score: 1 / len(d).
            # Append each d_score to score.
            d_score = 1 / len(d)
            score.append(d_score)

        # Calculate the mean of score. Append to all_means.
        mean = np.mean(score)
        all_means.append(mean)

        # Append plotdata to all_plots.
        all_plots.append(plotdata)
        print(plotdata)

    # Create figure: boxplot with data from "all_plots".
    xtick50 = "k=50, mean score over \n all topics: " + str(
        round(all_means[0], 4))
    xtick100 = "k=100, mean score over \n all topics: " + str(
        round(all_means[1], 4))
    xtick200 = "k=200, mean score over \n all topics: " + str(
        round(all_means[2], 4))
    fig = plt.figure(1, figsize=(9, 6))
    ax = fig.add_subplot(111)
    ax.boxplot(all_plots)
    ax.set_xticklabels([xtick50, xtick100, xtick200])
    ax.set_yticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    ax.set_ylabel("Highest value of root repetition per topic",
                  rotation='vertical')
    ax.set_xlabel("k = topics")
    ax.set_title("UN")
    fig.savefig('../out/mallet/figures/testdez/un.png', bbox_inches='tight')
Example #37
def word_dist(text):
    return Counter(
        [w for w in tokenizer(text.lower())
         if w.isalpha() and len(w) > 3]
    )
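# Hypothetical usage, assuming `tokenizer` is nltk.word_tokenize (as in Example #34):
#   word_dist("The quick brown fox jumps over the lazy dog")
#   ->  Counter({'quick': 1, 'brown': 1, 'jumps': 1, 'over': 1, 'lazy': 1})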