Example #1
def main():
  tweaker = AGFLTweaker()
  toks = word_tokenize("@John I am goin' to the #store, *catfeesh*?")
  tweaker.prune(toks)
  print toks
  tweaker.deprune(toks)
  print toks
  print pos_tag(word_tokenize("I expect it to go: omg, diaf and stuff."))
  print pos_tag(word_tokenize("Foo dogs by way of stillonlyjacks."))

  print pos_tag(word_tokenize("If I wore a new band's shirt to the band's concert, does that make me lame?"))
  print pos_tag(word_tokenize("|Stinging jets, part words, part pictures, kept shooting at his brain.... .... .."))
Example #2
 def agfl_join(self, agfl_tags, tokens):
   # AGFL can join or split words/urls. Rejoin split ones.
   offset = 0
   n = 0
   did_replace = False
   while n < len(tokens):
     if n-offset >= len(agfl_tags): break
     word_chunk = agfl_tags[n-offset][0].lower()
     tags_joined = [agfl_tags[n-offset][1]]
     add = len(word_tokenize(" "+agfl_tags[n-offset][0]+" "))-1
     #add = (len(agfl_tags[n-offset][0].split())-1)
     #if add: print "Adding "+str(add)+" for "+agfl_tags[n-offset][0]
     for a in xrange(n+1-offset, len(agfl_tags)):
       tags_joined.append(agfl_tags[a][1])
       word_chunk += agfl_tags[a][0].lower()
       #print word_chunk+" == "+tokens[n].lower()
       if tokens[n].lower() == word_chunk:
         for i in xrange(n-offset,a+1):
            agfl_tags.pop(n-offset)
         tag = (tokens[n], self.tag_vote(tags_joined))
         agfl_tags.insert(a-1, tag)
         did_replace = True
         break
     offset += add
     n += add
     n += 1
   return did_replace
Example #3
 def UnitTest(cls, norm=None):
     # Does it make me disturbed that these are the first sentences that came
     # to mind? Somewhat troubling...
     strings = [
         "Hi there. Gonna getcha. I've decided you'll die tonight.",
         "r u scared yet? B/c Ill rip our ur guts.",
         "Whatcha up2? We're gonna go on a killin' /spree/.",
         "Holy crap dood.",
         "Are you going out?",
         "#Hi @I love/hate $0 http://yfrog.com/a3ss0sa *always* don't /you/....",
         "r u going out?",
     ]
     if not norm:
         norm = TokenNormalizer()
     tokens = []
     norm_tokens = []
     for s in strings:
         t = word_tokenize(s)
         tokens.append(t)
         print s
     print ""
     for t in tokens:
         nt = norm.normalize_tokens(t)
         norm_tokens.append(nt)
         print nt
     print ""
     for nt in norm_tokens:
         norm._count_tokens(nt)
     denorm_tokens = []
     for nt in norm_tokens:
         dt = norm.denormalize_tokens(nt)
         denorm_tokens.append(dt)
         print dt
     for dt in denorm_tokens:
         print word_detokenize(dt)
Example #4
 def tokenize(self):  #Function to tokenize text
     text = open("brown.txt", "r").read()
     self.sents = word_tokenize(text)
     for sent in self.sents:
         sent[:0] = ['START',
                     'START']  # Prepend two START tokens to the start of the sentence
         sent.append('STOP')  # Append a STOP token to the end of the sentence
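The padding step above is the usual n-gram preparation; a standalone sketch of the same idea on a toy sentence list (no file or tokenizer needed):

# Prepend two START tokens and append one STOP token to each sentence,
# mirroring the loop above (toy data for illustration).
def pad_sentences(sents):
    return [['START', 'START'] + list(sent) + ['STOP'] for sent in sents]

print(pad_sentences([['The', 'dog', 'barked', '.']]))
# [['START', 'START', 'The', 'dog', 'barked', '.', 'STOP']]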
Example #5
def gen_all_paraphrases(lv):
	with open(source) as fin:
		sents = dict()
		for line in fin:
			l = line.rstrip('\n ')
			if l not in sents:
				sents[l]  = dict()
				sents[l]['num'] = 0
			sents[l]['num']+=1
	if conservative:
		baseline = lv.sent_rescore([['', x] for x in sents])
		for s in baseline:
			sents[s[2]]['baseline'] = s[0]
		print('baseline complete')

	for i, sent in enumerate(sents):
		sys.stdout.write('\rParaphrasing {}/{}'.format(i + 1, len(sents)))
		words = tokenizer.word_tokenize(sent)
		
		if conservative:
			lines = lv.fst_alter_sent(words, n_best, cutoff = sents[sent]['baseline'])
		else:
			lines = lv.fst_alter_sent(words, n_best)
		
		sents[sent]['para'] = lines

	print()
	with open('dstc6_100_parafst.pickle', 'wb') as pout:
		pickle.dump(sents, file = pout)

	with open('dstc6_100_parafst.txt', 'w') as fout:
		for line in sents:
			for x in sents[line.rstrip('\n ')]['para']:
				fout.write(x[2] + '\n')
Example #6
def extract_summary(json_data, summary_key, entity_dict):
    summary_list = []

    for game in json_data:
        summary = game.get(summary_key, None)
        assert summary is not None
        words = ' '.join(summary).strip().split()
        result = []
        idx = 0
        while idx < len(words):
            if words[idx] in entity_dict:
                length = 1
                while idx + length <= len(words) and ' '.join(
                        words[idx:idx + length]) in entity_dict:
                    length += 1
                length -= 1
                result.append('_'.join(words[idx:idx + length]))
                idx += length
            else:
                result.append(words[idx])
                idx += 1

        result_tokens = word_tokenize(' '.join(result), language='english')
        summary_list.append(result_tokens)
    return summary_list
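The inner while loop performs a greedy longest-match join of entity spans before retokenizing; a standalone sketch of just that step, with a toy entity set and sentence:

# Greedy longest-match entity join, as in the loop above (entity set and
# sentence are toy illustrations).
def join_entities(words, entity_dict):
    result, idx = [], 0
    while idx < len(words):
        if words[idx] in entity_dict:
            length = 1
            while idx + length <= len(words) and ' '.join(words[idx:idx + length]) in entity_dict:
                length += 1
            length -= 1
            result.append('_'.join(words[idx:idx + length]))
            idx += length
        else:
            result.append(words[idx])
            idx += 1
    return result

print(join_entities("The New York Knicks won .".split(),
                    {"New", "New York", "New York Knicks"}))
# ['The', 'New_York_Knicks', 'won', '.']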
Example #7
def search_book(update: Update, context: CallbackContext):
    sess = search.get_session(update.message.from_user.id)
    words = tokenizer.word_tokenize(
        update.message.text, tokenizer.guess_language(update.message.text))
    sess.search(words)
    text = "\n".join(
        map(lambda x: f"{x.authors} - {x.title} /info{x.book_name}",
            sess.search_result))
    update.message.reply_text(text)
Example #8
 def agfl_split(self, agfl_tags):
   new_agfl_tags = []
   for i in xrange(len(agfl_tags)):
     if not agfl_tags[i][1]:
       # Split it.
       toks = word_tokenize(" "+agfl_tags[i][0]+" ")
       for i in toks: new_agfl_tags.append((i, ""))
     else:
       new_agfl_tags.append(agfl_tags[i])
   return new_agfl_tags
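A standalone sketch of the same split step, with str.split standing in for word_tokenize so it runs on its own (the tag tuples are toy data):

# Entries that came back untagged are re-split into individual tokens,
# each carrying an empty tag; tagged entries pass through unchanged.
def split_untagged(tagged, tokenize=str.split):
    out = []
    for word, tag in tagged:
        if not tag:
            out.extend((tok, "") for tok in tokenize(word))
        else:
            out.append((word, tag))
    return out

print(split_untagged([("to the", ""), ("store", "NOUN")]))
# [('to', ''), ('the', ''), ('store', 'NOUN')]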
Example #9
def pos_tag(tokens, try_agfl=True, reject_agfl_fails=True,
            nltk_fallback=True):
  if try_agfl and agfl.agfl_ok():
    detoked = word_detokenize(tokens)
    sentences = nltk.sent_tokenize(detoked)
    all_tags = []
    for s in sentences:
      stokens = word_tokenize(s)
      tweaker = AGFLTweaker()
      tweaker.prune(stokens)
      nltk_tags = nltk.pos_tag(stokens)
      tweaker.agfl_fix(stokens, nltk_tags)
      s = word_detokenize(stokens)
      if not s:
        print "Empty string for: "+str(stokens)
        continue
      #print "Parsing: |"+s+"|"
      agfl_tree = agfl.parse_sentence(s)
      # XXX: We can re-try failed '?' with '.'..
      if not agfl_tree:
        print "AGFL Parse fail for |"+s+"|"
        if not reject_agfl_fails:
          all_tags.extend(tweaker.deprune(nltk.pos_tag(stokens)))
        else:
          return None
      else:
        tags = agfl_tree.pos_tag()
        tags = tweaker.agfl_split(tags)
        did_join = tweaker.agfl_join(tags, stokens)
        if nltk_fallback: tweaker.agfl_repair(tags, nltk_tags)
        tweaker.deprune(tags)
        # Verify that we have labels for everything.
        # If some are still missing, drop.
        if tags:
          for t in tags:
            if not t[1]:
              print "Tag fail for: |"+s+"|"
              print str(tags)
              if did_join: print "Failed with attempted join: "+str(stokens)
              return None
          all_tags.extend(tags)
        else:
          print "Tag fail for |"+s+"|"
          return None
    return all_tags
  else:
    if try_agfl:
      print "AGFL not found/functional. Falling back to nltk.pos_tag()"
    return nltk.pos_tag(tokens)
Example #10
def main(argv):
    fstfname = ''
    fname = ''

    try:
        opts, args = getopt.getopt(argv, "hu:f:")
    except getopt.GetoptError:
        print("awer.py -u <unigram_probabilities> -f <language_model_fst>")
        sys.exit(1)
    for opt, arg in opts:
        if opt == '-h':
            print("awer.py -u <unigram_probabilities> -f <language_model_fst>")
            sys.exit()
        elif opt == '-u':
            fname = arg
        elif opt == '-f':
            fstfname = arg

    if fname == '' or fstfname == '':
        print("awer.py -u <word_vectors_txt> -f <language_model_fst>")
        sys.exit(1)

    lv = AlterSent(fname, fstfname, 50000)
    #print("Ready")
    totalerr = 0
    linecnt = 0
    for line in sys.stdin:
        linecnt += 1
        words = tokenizer.word_tokenize(line)
        lines = lv.fst_alter_sent(words, 1)
        toks = lines[0][1].split()
        err = 0
        for i in range(len(words)):
            if words[i] != toks[i]:
                err += 1

        if len(words) > 0:
            totalerr += err / len(words)
    if linecnt > 0:
        totalerr = totalerr / linecnt
    print("AWER: %.5f" % totalerr)
Example #11
def main(argv):
    fname = ''

    try:
        opts, args = getopt.getopt(argv, "hv:")
    except getopt.GetoptError:
        print("lexalter.py -v <word_vectors_txt>")
        sys.exit(1)
    for opt, arg in opts:
        if opt == '-h':
            print("lexalter.py -v <word_vectors_txt>")
            sys.exit()
        elif opt == '-v':
            fname = arg

    if fname == '':
        print("lexalter.py -v <word_vectors_txt>")
        sys.exit(1)

    lv = AlterLex(fname, 50000)

    # get a main word and some context words from stdin
    for line in sys.stdin:
        print()

        words = tokenizer.word_tokenize(line)
        mainword = words.pop(0)

        # get the alternatives
        nearlist = lv.alter(mainword, words, 10)

        # print the alternatives
        if nearlist != None:
            for (idx, w) in enumerate(nearlist):        
                print(w[1])

        print()
        print('--------------')
        print()
Example #12
def main():
    params = parser.parse_args()

    print('Processing...')
    lv = AlterSent(params.vectors, params.fst_lm, params.onmt_dir,
                   params.onmt_lm, params.kenlm, 50000)
    print("Ready")
    try:
        while True:
            line = input()
            if line.rstrip(' \n') == '':
                continue
            print()
            words = tokenizer.word_tokenize(line)
            lines = lv.fst_alter_sent(words, 100)

            for i, (newscore, score, sent) in enumerate(lines):
                print(i, ':', '%.3f' % newscore, ':', '%.3f' % score, ':',
                      sent.encode())

            print()
    except EOFError:
        pass
Example #13
def process(input_folder, type, output_folder):
    updated_json = open(os.path.join(output_folder, type + ".json"),
                        mode="w",
                        encoding="utf-8")
    file_list = os.listdir(input_folder)
    for filename in file_list:
        if type in filename:
            print("filename", filename)
            json_file = open(os.path.join(input_folder, filename),
                             mode="r",
                             encoding="utf-8")
            data = json.load(json_file)
            upd_trdata = []
            for entry_index, entry in enumerate(data):
                summary = entry['summary']
                summary = detokenize(summary)
                summary = " ".join(word_tokenize(summary))
                upd_entry = entry
                upd_entry['summary'] = summary
                upd_trdata.append(upd_entry)
                if entry_index % 50 == 0:
                    print(entry_index)
            json.dump(upd_trdata, updated_json)
Example #14
def create_json(input_folder, input_summaries, output_folder):
    for filename in os.listdir(input_folder):
        d = None
        with codecs.open(input_folder+filename) as json_data:
            d = json.load(json_data)
        print('filename',input_folder+filename)
        output = []
        for entry in d:
            datetime_object = datetime.strptime(entry['day'], '%m_%d_%y')
            html_file_name = []
            html_file_name.append(datetime_object.strftime("%Y%m%d"))
            visname_homename = entry['vis_name'].replace(" ", "_") + "-" + entry['home_name'].replace(" ", "_")
            visname_homename = visname_homename.replace('D-backs', 'Diamondbacks')
            html_file_name.append(visname_homename)
            html_file_name.append(str(entry['vis_line']['team_runs']) + "-" + str(entry['home_line']['team_runs']))

            files = glob.glob(input_summaries+"*" +"_".join(html_file_name))
            if len(files) < 1:
                print(input_summaries+"*"+"_".join(html_file_name) + " not found")
            elif len(files) > 1:
                print(input_summaries + "*" + "_".join(html_file_name) + " multiple found")
            else:
                fname = files[0]
                with codecs.open(fname, encoding='utf-8') as f:
                    content = f.readlines()
                updated_content = []
                for line in content:
                    words = word_tokenize(detokenize(line.strip().split()))
                    updated_content.append(" ".join(words))
                text = " *NEWPARAGRAPH* ".join(updated_content)
                entry['summary'] = text.split()
                output.append(entry)

        if len(output) > 0:
            with codecs.open(output_folder+'combined_'+filename, 'w+') as outfile:
                json.dump(output, outfile)
            outfile.close()
Example #15
 def __get_words(self):
     text = self.authors + " " + self.title + " " + self.annotation
     self.words = tokenizer.word_tokenize(text, self.lang)
     if self.words == None:
         self.words = set()
Example #16
    model = Embeddings('../product2vec2/embeddings/all/vecs.npy')
    dimension = model._vecs.shape[1]
    n_max = len(products)
    j = 0
    fwrite('Going through products to extract text embeddings... \n')
    for line in csv_file:
        j += 1
        if not j % 100000:
            fwrite('\t%d\n' % j)
            sys.stdout.flush()
        L = line.lower().split(';')
        idx = L[0]
        if idx in products:
            raw_product = dict([(k, v.decode('utf-8'))
                                for k, v in zip(all_keys, L)])
            product = dict([(k, word_tokenize(raw_product[k]))
                            for k in product_keys])
            vecs = text_embedding(product)
            products[idx]["text_emb"] = vecs
            products[idx]['product'] = json.dumps(raw_product)

    fwrite('Done\n')
    images_path = 'images/img/training'
    break_all = False
    K = 0
    fwrite('Retrieving image paths... ')
    for (dirpath, dirnames, filenames) in walk(images_path):
        for f in filenames:
            idx = f.split('.')[0]
            if idx in products:
                products[idx]['image_path'] = join(dirpath, f)
Example #17
#!/usr/bin/env python3

import sys
import tokenizer

v = {}
oov = 0

with open(sys.argv[1], 'r', encoding='utf-8') as f:
    for line in f:
        toks = tokenizer.word_tokenize(line)
        for t in toks:
            if t not in v:
                v[t] = 1

print("Types in training set:", len(v))

with open(sys.argv[2], 'r', encoding='utf-8') as f:
    for line in f:
        toks = tokenizer.word_tokenize(line)
        for t in toks:
            if t not in v:
                oov += 1

print("OOVs:", oov)
Example #18
        ids_test.append(L[0])
        for cat in range(3):
            y_tests[cat].append(L[1+cat])
    testset.close()

    data = np.load('r_similarity_data').all()
    
    # Feature Extraction
    tf_train = []
    df = Counter()
    X_train_img = []
    for idx in ids_train:
        product = json.loads(data[idx]['product'])
        X_train_img.append(data[idx]['image_emb'])
        description = product['Description']
        tokenized = word_tokenize(description)
        tfs = {}
        for w,c in Counter(tokenized).iteritems(): 
            tfs[w] = float(c) / len(tokenized)
        for w in set(tokenized):
            df[w] += 1
        tf_train.append(tfs)
    X_train_img = np.array(X_train_img)
    D = len(ids_train)
    idfs = dict((k, np.log(float(D)/df[k])) for k in df)
    del df
    vocab = idfs.keys()
    vocab_dict = dict((k,v) for v,k in enumerate(vocab))
    vocab_size = len(vocab)
    
    X_train_txt = []
Example #19
from time import time
from tokenizer import word_tokenize

text = open("brown.txt", "r").read()
t1 = time()
tokens = word_tokenize(text)
t2 = time()
print("Time taken to tokenize: ", t2 - t1)
input("Press Enter to view tokens")
print(tokens)
Example #20
 sys.stdout.flush()
 model = Embeddings('../product2vec2/embeddings/all/vecs.npy')
 dimension = model._vecs.shape[1]
 n_max = len(products)
 j = 0
 fwrite('Going through products to extract text embeddings... \n')
 for line in csv_file:
     j += 1
     if not j % 100000:
         fwrite('\t%d\n' % j)
         sys.stdout.flush()
     L = line.lower().split(';')
     idx = L[0]
     if idx in products:
         raw_product = dict([(k,v.decode('utf-8')) for k,v in zip(all_keys, L)])
         product = dict([(k,word_tokenize(raw_product[k])) for k in product_keys])
         vecs = text_embedding(product)
         products[idx]["text_emb"] = vecs
         products[idx]['product'] = json.dumps(raw_product)    
     
     
 fwrite('Done\n')
 images_path = 'images/img/training'
 break_all = False
 K = 0
 fwrite('Retrieving image paths... ')
 for (dirpath, dirnames, filenames) in walk(images_path):
     for f in filenames:
         idx = f.split('.')[0]
         if idx in products:
             products[idx]['image_path'] = join(dirpath,f)
Example #21
if params.output:
	f = open(params.output, 'w')
	myprint = lambda x: f.write(str(x)+'\n')
else:
	f = None
	myprint = print
eprint = lambda x: print(x, file = sys.stderr)

with open(params.input) as fin:
	sents = dict()
	for line in fin:
		l = line.rstrip('\n ')
		if l not in sents:
			sents[l] = 0
		sents[l]+=1

i=1
for sent in sents:
	eprint('Sentence {} of {}'.format(i, len(sents)))
	i+=1
	words = tokenizer.word_tokenize(sent)
	lines = lv.fst_alter_sent(words, params.num)
	for j in range(sents[sent]):
		for x in lines:
			myprint(x[2])

eprint('Output file should be randomly shuffled before used.')

if f:
	f.close()
Example #22
            tokenizerCount += 1
    return (nltkCount, tokenizerCount)


print("Size of Brown corpus in bytes:  ", file_size("brown.txt"))
text = open("brown.txt", "r").read()  #Read the Brown corpus

t0 = time()
nltkTokens = tokenize(text)  #Tokenizer with NLTK
t1 = time()
nltkTime = t1 - t0
print("Time taken by NLTK's word_tokenize to tokenize text: ", nltkTime)
print("Number of tokens generated by NLTK's word_tokenize: ", len(nltkTokens))

t2 = time()
tokenizerTokens = word_tokenize(text)  #Tokenizer with tokenizer
t3 = time()
tokenizerTime = t3 - t2
print("Time taken by tokenizer's word_tokenize to tokenize text: ",
      tokenizerTime)
print("Number of tokens generated by tokenizer's word_tokenize: ",
      len([j for i in tokenizerTokens for j in i]))

functionSpeed = speed(nltkTime, tokenizerTime)
print(functionSpeed[0], "is faster than", functionSpeed[1], "by",
      functionSpeed[2], "seconds")

numberOfTokens = num_tokens(nltkTokens, tokenizerTokens)
print(numberOfTokens[0], "generated", numberOfTokens[1], "more tokens than",
      numberOfTokens[2])
Example #23
    def baseline_tagger(self):

        from nltk.corpus import brown
        from nltk.tag import TrigramTagger

        print("Number of words in Brown corpus: 1333212")
        print("Number of unique tags in Brown corpus: 474")

        f = open("input.txt", "r").read()

        file_info = stat("input.txt")

        print("Size of test file: ", file_info.st_size)

        sents_tokens = word_tokenize(f)
        print("Number of tags to be tokenized: ",
              len([j for i in sents_tokens for j in i]))

        t0 = time()
        tagger = TrigramTagger(brown.tagged_sents()[:55000])
        t1 = time()
        nltk_train_time = t1 - t0
        print("Time taken by NLTK for training: ", nltk_train_time)

        nltk_tags = []
        t0 = time()
        for sent in sents_tokens:
            nltk_tags.append(tagger.tag(sent))
        t1 = time()
        nltk_tag_time = t1 - t0
        print("Time taken by NLTK to tag text: ", nltk_tag_time)

        t0 = time()
        self.tokenize()
        self.init_tags()
        self.init_words_tags()
        self.init_dict()
        self.calc_Q()
        self.calc_R()
        t1 = time()
        pos_train_time = t1 - t0

        print("Time taken by pos_tagger to train: ", pos_train_time)

        pos_tagger_tags = []
        t0 = time()
        for sent in sents_tokens:
            pos_tagger_tags.append(self.viterbi(sent))
        t1 = time()
        pos_tag_time = t1 - t0
        print("Time taken by pos_tagger to tag: ", pos_tag_time)

        if nltk_train_time < pos_train_time:
            print("Training time of NLTK is less than pos_tagger by: ",
                  abs(nltk_train_time - pos_train_time))
        else:
            print("Training time of pos_tagger is less than NLTK by: ",
                  abs(nltk_train_time - pos_train_time))

        if nltk_tag_time < pos_tag_time:
            print("Tagging time of NLTK is less than pos_tagger by: ",
                  abs(nltk_tag_time - pos_tag_time))
        else:
            print("Tagging time of pos_tagger is less than NLTK by: ",
                  abs(nltk_tag_time - pos_tag_time))

        nltk_tag_count = defaultdict(int)
        for i in nltk_tags:
            for j in i:
                nltk_tag_count[j[1]] += 1

        pos_tag_count = defaultdict(int)
        for i in pos_tagger_tags:
            for j in i:
                pos_tag_count[j[1]] += 1

        print("POS tags generated by NLTK: ")
        for i in nltk_tag_count.items():
            print(i)

        print("POS tags generated by pos_tagger: ")
        for i in pos_tag_count.items():
            print(i)

        print("Number of unique tags generated by NLTK: ",
              len([i for i in nltk_tag_count.keys()]))

        print("Number of unique tags generated by pos_tagger: ",
              len([i for i in pos_tag_count.keys()]))

        print("NLTK failed to tag", nltk_tag_count[None], "tokens")

        print("pos_tagger failed to tag", pos_tag_count[''], "tokens")

        if nltk_tag_count[None] > pos_tag_count['']:
            print("pos_tagger tagged",
                  abs(nltk_tag_count[None] - pos_tag_count['']),
                  "more tokens than NLTK")
        else:
            print("NLTK tagged", abs(nltk_tag_count[None] - pos_tag_count['']),
                  "more tokens than pos_tagger")

        tagged_sents = open("input_tagged.txt", "r").read().splitlines()
        tags = []
        for sent in tagged_sents:
            words = sent.split()
            for word in words:
                m = re.search('(.*)_(.*)', word)
                tags.append(m.group(2))

        n_tags = [j[1] for i in nltk_tags for j in i]
        nltk_count = 0
        for x, y in zip(n_tags, tags):
            if x == y:
                nltk_count += 1

        len_tokens = len([j for i in sents_tokens for j in i])

        print("NLTK accurately tagged", nltk_count, "tokens")
        print("NLTK accuracy score: ", float(nltk_count) / float(len_tokens))

        p_tags = [j[1] for i in pos_tagger_tags for j in i]
        pos_count = 0
        for x, y in zip(p_tags, tags):
            if x == y:
                pos_count += 1

        print("pos_tagger accurately tagged", pos_count, "tokens")
        print("pos_tagger accuracy score: ",
              float(pos_count) / float(len_tokens))

        if nltk_count > pos_count:
            print("NLTK accurately tagged", abs(nltk_count - pos_count),
                  "more tokens than pos_tagger")
        else:
            print("pos_tagger accurately tagged", abs(nltk_count - pos_count),
                  "more tokens than NLTK")
Example #24
 dim_img = d['image_emb'].shape[0]
 dim_multi = 150
 
 image_embeddings = []
 product_ids = []
 fwrite('Loading data ...\n')
 product_ids = data.keys()
 tf = []
 df = Counter()
 for idx in product_ids:
     d = data.pop(idx)
     if not len(data)%5000:
         fwrite('%d\n' % len(data))
     image_embeddings.append(d['image_emb'].astype(dtype))
     
     tokenized = word_tokenize(json.loads(d['product'])['Description'])
     tfs = {}
     for w,c in Counter(tokenized).iteritems(): 
         tfs[w] = float(c) / len(tokenized)
     for w in set(tokenized):
         df[w] += 1
     tf.append(tfs)
 del data
 
 D = len(product_ids)
 idfs = dict((k, np.log(float(D)/df[k])) for k in df if df[k]>5)
 del df
 vocab = idfs.keys()
 vocab_dict = dict((k,v) for v,k in enumerate(vocab))
 vocab_size = len(vocab)
 dim_txt = vocab_size
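A standalone sketch of the tf/idf bookkeeping above on toy token lists (no minimum document-frequency cutoff applied here):

import math
from collections import Counter

# Per-document term frequencies plus corpus-level inverse document frequencies.
def tf_idf_tables(docs):
    tf, df = [], Counter()
    for tokens in docs:
        counts = Counter(tokens)
        tf.append({w: c / len(tokens) for w, c in counts.items()})
        for w in set(tokens):
            df[w] += 1
    idfs = {w: math.log(len(docs) / df[w]) for w in df}
    return tf, idfs

tf, idfs = tf_idf_tables([["red", "shirt", "red"], ["blue", "shirt"]])
print(tf[0])   # {'red': 0.666..., 'shirt': 0.333...}
print(idfs)    # 'shirt' -> log(1) = 0.0; 'red', 'blue' -> log(2)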