Example No. 1
import codecs
from random import shuffle

# `DATA_IN`, `DATA_OUT`, and `preprocess` are defined at module level in this repository.
def read_semeval(binary=False):
    # ---- SemEval Twitter sentiment files
    all_msgs = []
    for fname in [
            "semeval_train_complete.txt", "Twitter2013_raw.txt",
            "Twitter2014_raw.txt", "Twitter2015_raw.txt"
    ]:
        msgs = []
        with codecs.open(DATA_IN + "semeval/%s" % fname, "r", "utf-8") as fid:
            for l in fid:
                spt = l.rstrip("\n").split("\t")
                label = spt[0].replace("\"", "")
                # fold the mixed class into "neutral"
                if label == "objective-OR-neutral":
                    label = "neutral"
                # keep only the labels relevant to the requested task
                if (binary and label not in ["positive", "negative"]) \
                        or label not in ["positive", "neutral", "negative"]:
                    continue
                tweet = spt[1]
                tweet = preprocess(tweet)
                ex = (label, tweet)
                msgs.append(ex)
        shuffle(msgs)
        all_msgs += msgs
        with codecs.open(DATA_OUT + fname.lower(), "w", "utf-8") as fod:
            for ex in msgs:
                fod.write('\t'.join(ex) + "\n")

    return all_msgs
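
A minimal usage sketch (assuming the module-level DATA_IN/DATA_OUT paths and preprocess are set up as in the repository):

# Hypothetical call: load the binary-sentiment subset and peek at one example.
msgs = read_semeval(binary=True)
print(len(msgs), "labeled tweets")
label, tweet = msgs[0]
print(label, "->", tweet)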
Example No. 2
import codecs
from random import shuffle

from bs4 import BeautifulSoup  # the "xml" feature used below also requires lxml

# `DATA_IN`, `DATA_OUT`, and `preprocess` are defined at module level in this repository.
def read_hcr(binary=False):
    # ---- HCR (Health Care Reform) corpus
    all_msgs = []
    for f in ["dev.xml", "train.xml", "test.xml"]:
        msgs = []
        with open(DATA_IN + "hcr/%s" % f) as fid:
            soup = BeautifulSoup(fid.read(), "xml")
            for item in soup.findAll('item'):
                label = item.attrs['label']
                if (binary and label not in ["positive", "negative"]) \
                        or label not in ["positive", "neutral", "negative"]:
                    continue
                msg = item.find("content").text
                # .text already yields a (unicode) str, so no .decode() is needed
                msg = preprocess(msg)
                ex = (label, msg)
                msgs.append(ex)
            shuffle(msgs)
        all_msgs += msgs
        fname = "hcr_%s.txt" % f.replace(".xml", "")
        with codecs.open(DATA_OUT + fname, "w", "utf-8") as fod:
            for ex in msgs:
                fod.write('\t'.join(ex) + "\n")

    return all_msgs
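
For reference, a hypothetical smoke test showing the item shape the loop above expects (requires beautifulsoup4 and lxml):

from bs4 import BeautifulSoup

doc = '<items><item label="negative"><content>so disappointed today</content></item></items>'
soup = BeautifulSoup(doc, "xml")
item = soup.findAll("item")[0]
print(item.attrs["label"], "->", item.find("content").text)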
Example No. 3
import torch

# `MyModel0`, `VOCAB`, `preprocess`, `color_print`, and `pred_to_dict`
# are defined elsewhere in this repository.
def inference(text):
    # `text` is a one-element list holding the raw input string
    text[0] = preprocess(text[0])
    device = torch.device("cpu")
    hidden_size = 256
    model = MyModel0(len(VOCAB), 16, hidden_size).to(device)
    model.load_state_dict(torch.load("model.pth", map_location=device))
    model.eval()  # disable training-only behavior (e.g. dropout) for inference

    #text = ["shubham bisht, something happens"]
    text_tensor = torch.zeros(len(text[0]), 1, dtype=torch.long)
    text_tensor[:,
                0] = torch.LongTensor([VOCAB.find(c) for c in text[0].upper()])
    #print(text_tensor)
    inp = text_tensor.to(device)

    oupt = model(inp)
    prob = torch.nn.functional.softmax(oupt, dim=2)
    prob, pred = torch.max(prob, dim=2)  # per-character confidence and predicted class

    color_print(text[0], pred)
    # renamed from `json` to avoid shadowing the standard-library module
    result = pred_to_dict(text[0], pred, prob)
    print("\n###########################\n")
    print(result)
    return result
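
A hedged usage sketch (model.pth, MyModel0, and VOCAB are artifacts of this repository and must exist on disk / in scope):

# Hypothetical call; the input string is taken from the example that was
# commented out in the original source.
output = inference(["shubham bisht, something happens"])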
Example No. 4

import codecs
import re
from pdb import set_trace

# `get_parser`, `build_folds`, `ut` (utils), and `emb_utils` are modules and
# functions defined elsewhere in this repository.

if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    idz = []
    print("Preprocess Data")
    with codecs.open(args.out_txt,"w","utf-8") as fod:
        with codecs.open(args.input,"r","utf-8") as fid:
            msgs = []
            for line in fid:
                # strip newlines and quote characters, and remove the hashtags
                # that would leak the sarcasm label
                clean_line = re.sub('[\n\r\'\"]', '', line)
                clean_line = clean_line.replace("#sarcasm", "").replace("#sarcastic", "")
                st = clean_line.split("\t")
                if len(st) != 4:
                    set_trace()  # drop into the debugger on malformed rows
                tweet_id, user, label, m = st
                idz.append(int(tweet_id))
                m = ut.preprocess(m, sep_emoji=True)
                fod.write(u"%s\t%s\t%s\t%s\n" % (tweet_id, user, label, m))
                msgs.append(m)
    # compute the word index
    wrd2idx = ut.word_2_idx(msgs)
    print("Load Word Embeddings")
    emb_utils.save_embeddings_txt(args.word_vectors, args.out_vectors, wrd2idx)
    # pre-compute the cross-validation folds so that different models
    # can be compared on the same data splits
    build_folds(idz)
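
The input file is expected to be a 4-column TSV; a hypothetical row matching the unpacking above:

# tweet_id <TAB> user <TAB> label <TAB> message
sample = "\t".join(["123456789", "some_user", "sarcastic", "great, another Monday #sarcasm"])
print(sample.split("\t"))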



Example No. 5
# import MeCab

# tagger = MeCab.Tagger("-Owakati")

# result = tagger.parse("I have a pen. You have a dance.")

# print(result.split())

from my_utils import preprocess

# window_size = 1
# hidden_size = 5     # size of the hidden layer (= dimensionality of the word vectors)

# "There are a lot of rainy days during the rainy season. I hope it ends soon."
japanese_text = "梅雨で雨の日が多いですね。早く梅雨が明けて欲しいですね。"
corpus, word_to_id, id_to_word = preprocess(japanese_text)
print(corpus)
print(word_to_id)
print(id_to_word)

# `SimpleCBOW` is assumed to follow the book-style signature
# SimpleCBOW(vocab_size, hidden_size); the import path is a guess for this project.
from simple_cbow import SimpleCBOW

model = SimpleCBOW(len(word_to_id), 5)  # hidden size 5, mirroring the comment above
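
If `my_utils.preprocess` splits on whitespace (as the book-style preprocess does), the unsegmented Japanese sentence above becomes a single token; the commented-out MeCab lines at the top hint at the intended fix. A hedged sketch:

# Segment the sentence into space-separated tokens first (requires mecab-python3),
# then feed the segmented text to preprocess.
import MeCab

tagger = MeCab.Tagger("-Owakati")
segmented = tagger.parse(japanese_text).strip()
corpus, word_to_id, id_to_word = preprocess(segmented)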