def determine_if_video_media_from_reddit_json(data):
    # reads the reddit JSON and determines whether the link is a video
    is_a_video = False

    media_url = clean_str(data, ['media', 'oembed', 'url'], '')
    if media_url == '':
        media_url = clean_str(data, ['url'])

    # TODO: also check "post_hint": "rich:video"

    media_url = media_url.split('?')[0]  #get rid of the query string
    try:
        oembed_type = data['media']['oembed']['type']
        if oembed_type is None:
            # entry['data']['media'] is usually null for non-videos, but it is
            # also null for .gifv links (especially NSFW posts)
            if ".gifv" in media_url.lower():  # special case for imgur
                is_a_video = True
            else:
                is_a_video = False
        elif oembed_type == 'video':
            is_a_video = True
        else:
            is_a_video = False
    except (KeyError, TypeError, AttributeError):
        is_a_video = False

    return is_a_video
def determine_if_video_media_from_reddit_json(data):
    from utils import clean_str

    is_a_video = False

    media_url = clean_str(data, ['media', 'oembed', 'url'], '')
    if media_url == '':
        media_url = clean_str(data, ['url'])

    media_url = media_url.split('?')[0]  # get rid of the query string
    try:
        oembed_type = data['media']['oembed']['type']

        if oembed_type is None:
            # entry['data']['media'] is usually null for non-videos, but it is
            # also null for .gifv links (especially NSFW posts)
            if ".gifv" in media_url.lower():  # special case for imgur
                is_a_video = True
            else:
                is_a_video = False
        elif oembed_type == 'video':
            is_a_video = True
        else:
            is_a_video = False
    except (KeyError, TypeError, AttributeError):
        is_a_video = False

    return is_a_video
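The reddit-related examples on this page call clean_str(data, keys, default) to walk nested JSON safely. The real helper lives in the add-on's utils module and is not shown here; the snippet below is only a minimal sketch of the assumed behavior (the exact error handling and string conversion are guesses), handy if you want to run these examples in isolation.

def clean_str(data, keys, default=''):
    # Minimal sketch of the assumed utils.clean_str(dict, key_path, default):
    # walk a nested dict/list structure along `keys` and return the value as a
    # string, or `default` if any key/index is missing or the value is None.
    value = data
    try:
        for key in keys:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        return default
    if value is None:
        return default
    return value if isinstance(value, str) else str(value)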
Example #4
def write_sent_data(docs, fdir, balance_classes=False):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            sent_labels = [
                any([utils.s_overlap(s, f.ev) for f in doc.frames])
                for s in doc.sents
            ]
            pos_sents = [s.text for s, l in zip(doc.sents, sent_labels) if l]
            neg_sents = [
                s.text for s, l in zip(doc.sents, sent_labels) if not l
            ]
            if balance_classes:
                neg_samples = []
                for pos_s in pos_sents:
                    neg_sents = sorted(neg_sents,
                                       key=lambda s: abs(len(s) - len(pos_s)))
                    try:
                        neg_samples.append(neg_sents.pop(0))
                        neg_samples.append(neg_sents.pop(0))
                    except IndexError:
                        print(
                            'Warning: unable to sample enough negatives from doc {}'
                            .format(doc.id))
                neg_sents = neg_samples
            for s in pos_sents:
                fout.write('1\t{}\t{}\n'.format(doc.id, utils.clean_str(s)))
            for s in neg_sents:
                fout.write('0\t{}\t{}\n'.format(doc.id, utils.clean_str(s)))
def r_linkHunter(json_node,d=0):
    from utils import clean_str
    #recursive function to harvest stuff from the reddit comments json reply
    prog = re.compile('<a href=[\'"]?([^\'" >]+)[\'"]>(.*?)</a>')
    for e in json_node:
        link_desc=""
        link_http=""
        author=""
        created_utc=""
        e_data=e.get('data')
        score=e_data.get('score',0)
        if e['kind']=='t1':     #'t1' for comments   'more' for more comments (not supported)
            #log("replyid:"+str(d)+" "+e['data']['id'])
            #body=e['data']['body'].encode('utf-8')

            #log("reply:"+str(d)+" "+body.replace('\n','')[0:80])
            try: replies=e_data.get('replies')['data']['children']
            except (AttributeError,TypeError): replies=""

            post_text=clean_str(e_data,['body'])
            post_text=post_text.replace("\n\n","\n")

            post_html=clean_str(e_data,['body_html'])

            created_utc=e_data.get('created_utc','')

            author=clean_str(e_data,['author'])

            # I initially tried to search for [link description](https://www.youtube.com/...) in post_text, but some posts do not follow this convention
            #prog = re.compile('\[(.*?)\]\((https?:\/\/.*?)\)')
            #result = prog.findall(post_text)

            result = prog.findall(post_html)
            if result:
                #store the post by itself and then a separate one for each link.
                harvest.append((score, link_desc, link_http, post_text, post_html, d, "t1",author,created_utc,)   )

                for link_http,link_desc in result:
                    harvest.append((score, link_desc, link_http, link_desc, post_html, d, "t1",author,created_utc,)   )
            else:
                harvest.append((score, link_desc, link_http, post_text, post_html, d, "t1",author,created_utc,)   )

            d+=1  # d tracks how deep in the comment tree we are
            r_linkHunter(replies,d)
            d-=1

        if e['kind']=='t3':     #'t3' for post text (a description of the post)
            self_text=clean_str(e_data,['selftext'])
            self_text_html=clean_str(e_data,['selftext_html'])

            result = prog.findall(self_text_html)
            if len(result) > 0 :
                harvest.append((score, link_desc, link_http, self_text, self_text_html, d, "t3",author,created_utc, )   )

                for link_http,link_desc in result:
                    harvest.append((score, link_desc, link_http, link_desc, self_text_html, d, "t3",author,created_utc, )   )
            else:
                if len(self_text) > 0:  # don't post an empty title
                    harvest.append((score, link_desc, link_http, self_text, self_text_html, d, "t3",author,created_utc,)   )
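For context, r_linkHunter appends into a module-level harvest list and is driven from listLinksInComment further down this page. The following is only a rough usage sketch of that call pattern; reddit_request comes from the same project, and post_url is a hypothetical post permalink.

import json

# Rough usage sketch (assumes r_linkHunter and reddit_request from this project,
# and a hypothetical post permalink in post_url):
harvest = []                                   # module-level accumulator
content = json.loads(reddit_request(post_url + '.json'))
r_linkHunter(content[0]['data']['children'])   # the post itself (kind 't3')
r_linkHunter(content[1]['data']['children'])   # the comment tree (kind 't1')
for score, desc, link, text, html, depth, kind, author, created_utc in harvest:
    print('  ' * depth, score, link or text[:60])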
Example #6
def write_o_ev_data_pipeline(docs, fdir):
    fout = open('{}/{}.tsv'.format(fdir, docs[0].group), 'w')
    for doc in docs:
        assert doc.group == 'test' or doc.group == 'testtest'
        for ev_span in doc.labels['BERT_ev']:
            for o_span in utils.s_overlaps(ev_span, doc.labels['NER_o']):
                fout.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format('0', doc.id, o_span.i, o_span.f, \
                  utils.clean_str(o_span.text), utils.clean_str(ev_span.text)))
Example #7
    def load_train_val_dataset(self, questions, duplicates, label):
        num_positive_samples = 145000
        num_negative_samples = 145000
        tr_idx = 250000
        val_idx = 290000

        # sample positive and negative labels
        positive_labels = []
        negative_labels = []
        for index, value in enumerate(label):
            if value == 1:
                positive_labels.append(index)
            else:
                negative_labels.append(index)
        positive_selection = random.sample(positive_labels,
                                           num_positive_samples)
        negative_selection = random.sample(negative_labels,
                                           num_negative_samples)
        selection = positive_selection + negative_selection
        random.shuffle(selection)

        #sample questions based on selected labels
        questions = [questions[i] for i in selection]
        duplicates = [duplicates[i] for i in selection]
        label = [label[i] for i in selection]

        questions = [utils.clean_str(sentence) for sentence in questions]
        questions_seq = self.tokenizer.texts_to_sequences(questions)
        questions_data = pad_sequences(questions_seq,
                                       self.opts.max_sequence_len)

        tr_input_one = questions_data[0:tr_idx]
        val_input_one = questions_data[tr_idx:val_idx]

        duplicates = [utils.clean_str(sentence) for sentence in duplicates]
        duplicates_seq = self.tokenizer.texts_to_sequences(duplicates)
        duplicates_data = pad_sequences(duplicates_seq,
                                        self.opts.max_sequence_len)
        tr_input_two = duplicates_data[0:tr_idx]
        val_input_two = duplicates_data[tr_idx:val_idx]

        target = label[0:tr_idx]
        target_val = label[tr_idx:val_idx]
        # convert labels to one-hot encoding
        tr_label = np_utils.to_categorical(target)
        val_label = np_utils.to_categorical(target_val)

        dataset = {
            'tr_input_one': tr_input_one,
            'tr_input_two': tr_input_two,
            'tr_label': tr_label,
            'val_input_one': val_input_one,
            'val_input_two': val_input_two,
            'val_label': val_label
        }
        return dataset
Example #8
def get_clean_words(docs):
    clean_words = []
    for doc in docs:
        if args.dataset != "mr":
            temp = clean_str(doc).split()
            temp = list(filter(lambda x : x not in stop_words, temp))
        else:
            temp = clean_str(doc).split()
        clean_words.append(temp)
    return clean_words
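The text-classification examples (this one, and the MR/MPQA/SSTB loaders below) pass a single string to clean_str and expect a lowercased, space-tokenizable sentence back. Each repository ships its own variant, commonly derived from the regex cleaner used in CNN sentence-classification code; the following is only a representative sketch of that style, not the exact implementation used by any of these projects.

import re

def clean_str(string):
    # Representative sketch of the string-level clean_str used by the
    # text-classification examples: normalize punctuation, then lowercase.
    string = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", string)
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()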
Example #9
def process(eval_tuple, baseline=False):
    dbr = clean_str(eval_tuple['dbr'])
    question = clean_str(eval_tuple['question'])
    match_fn = simple_match if baseline else match_by_trigrams
    mention = match_fn(dbr, question)
    # mention = merge(dbr, match_by_trigrams(dbr, question), simple_match(dbr, question))
    if mention is None:
        mention = ''
    output = eval_tuple.copy()
    output['mention'] = mention
    return output
Example #10
def r_linkHunter(json_node,d=0):
    from utils import clean_str

    prog = re.compile('<a href=[\'"]?([^\'" >]+)[\'"]>(.*?)</a>')
    for e in json_node:
        link_desc=""
        link_http=""
        author=""
        created_utc=""
        e_data=e.get('data')
        score=e_data.get('score',0)
        if e['kind']=='t1':     #'t1' for comments   'more' for more comments (not supported)

            try: replies=e_data.get('replies')['data']['children']
            except (AttributeError,TypeError): replies=""

            post_text=clean_str(e_data,['body'])
            post_text=post_text.replace("\n\n","\n")

            post_html=clean_str(e_data,['body_html'])

            created_utc=e_data.get('created_utc','')

            author=clean_str(e_data,['author'])


            result = prog.findall(post_html)
            if result:

                harvest.append((score, link_desc, link_http, post_text, post_html, d, "t1",author,created_utc,)   )

                for link_http,link_desc in result:
                    harvest.append((score, link_desc, link_http, link_desc, post_html, d, "t1",author,created_utc,)   )
            else:
                harvest.append((score, link_desc, link_http, post_text, post_html, d, "t1",author,created_utc,)   )

            d+=1  # d tracks how deep in the comment tree we are
            r_linkHunter(replies,d)
            d-=1

        if e['kind']=='t3':     #'t3' for post text (a description of the post)
            self_text=clean_str(e_data,['selftext'])
            self_text_html=clean_str(e_data,['selftext_html'])

            result = prog.findall(self_text_html)
            if len(result) > 0 :
                harvest.append((score, link_desc, link_http, self_text, self_text_html, d, "t3",author,created_utc, )   )

                for link_http,link_desc in result:
                    harvest.append((score, link_desc, link_http, link_desc, self_text_html, d, "t3",author,created_utc, )   )
            else:
                if len(self_text) > 0:  # don't post an empty title
                    harvest.append((score, link_desc, link_http, self_text, self_text_html, d, "t3",author,created_utc,)   )
def create_mr():
    pos = read_file('raw_datasets/rt-polarity.pos')
    neg = read_file('raw_datasets/rt-polarity.neg')

    # build matrices
    X, y = [], []
    for sent in pos:
        X.append(clean_str(sent))
        y.append([0,1])
    for sent in neg:
        X.append(clean_str(sent))
        y.append([1,0])
    
    # build vocab
    mr_vocab = Vocabulary(X)
    print('vocab', len(mr_vocab.vocab))

    # encode sents
    max_seq_len = compute_avg_len(X)
    for i in range(len(X)):
        X[i] = encode_sent(X[i].split(' '), mr_vocab.encoding, max_seq_len)

    # build embeddings
    embeddings = []
    for name, (emb_vocab, emb_vectors) in embeddings_map.items():
        embedding, found = create_embeddings(
            mr_vocab, emb_vocab, emb_vectors, 300
        )
        embeddings.append(embedding)
        print('{} - {}'.format(name, found))
    w2v_embeddings, glove_embeddings, nb_embeddings = embeddings

    # shuffle
    X, y = np.array(X), np.array(y)
    indices = np.random.permutation(len(X))
    X, y = X[indices], y[indices]

    split_idx = int(len(X) * 0.9)
    X_train, X_valid = X[:split_idx], X[split_idx:]
    y_train, y_valid = y[:split_idx], y[split_idx:]

    print('train', X_train.shape, y_train.shape)
    print('valid', X_valid.shape, y_valid.shape)

    # save objects
    save_object('datasets/mr_train', (X_train, y_train))
    save_object('datasets/mr_valid', (X_valid, y_valid))
    save_object('datasets/mr_vocab', mr_vocab)
    save_object('datasets/mr_w2v_embs', w2v_embeddings)
    save_object('datasets/mr_glove_embs', glove_embeddings)
    save_object('datasets/mr_nb_embs', nb_embeddings)
Example #12
def write_i_c_o_data(docs, fdir):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            for frame in doc.frames:
                i_text = utils.clean_str(frame.i.text)
                c_text = utils.clean_str(frame.c.text)
                o_text = utils.clean_str(frame.o.text)
                s1 = '{} vs. {}'.format(i_text, c_text)
                s2 = '{}'.format(o_text)
                fout.write('{}\t{}\t{}\t{}\n'.format(frame.label + 1, doc.id,
                                                     s1, s2))
Example #13
def main(args):

    device = flow.device("cpu") if args.no_cuda else flow.device("cuda")
    with open(args.config_path, "r") as f:
        config = json.load(f)
    with open(args.vocab_path, "rb") as f:
        vocab = pickle.load(f)
    textcnn = textCNN(
        word_emb_dim=config["word_emb_dim"],
        vocab_size=len(vocab),
        dim_channel=config["dim_channel"],
        kernel_wins=config["kernel_wins"],
        dropout_rate=config["dropout_rate"],
        num_class=config["num_class"],
        max_seq_len=config["max_seq_len"],
    )
    textcnn.load_state_dict(flow.load(args.model_path))
    textcnn.eval()
    textcnn.to(device)
    text = utils.clean_str(args.text)
    text = [utils.tokenizer(text)]
    input = flow.tensor(np.array(utils.tensorize_data(text, vocab,
                                                      max_len=200)),
                        dtype=flow.long).to(device)
    predictions = textcnn(input).softmax()
    predictions = predictions.numpy()
    clsidx = np.argmax(predictions)
    print("predict prob: %f, class name: %s" % (np.max(predictions), clsidx))
Example #14
def fileToMat(filename, w2vec, maxLen, label_set, train=True):
    kkma = Kkma()
    train_f = open(filename, 'r', encoding='utf-8')
    mat = []
    line_num = 0
    for line in train_f.read().splitlines():
        sen = {}
        line_splitted = line.split('\t')
        sbj = line_splitted[0].strip()
        obj = line_splitted[1].strip()
        relation = line_splitted[2].strip()
        sentence = line_splitted[3].strip()
        sentence_complete = re.sub('<< _obj_ >>', obj,
                                   re.sub('<< _sbj_ >>', sbj, sentence))
        sentence_complete = utils.clean_str(sentence_complete)
        tokens = [
            p[0] + '/' + p[1] for p in kkma.pos(sentence_complete)
            if p[0] + '/' + p[1] in w2vec.vocab
        ]
        if maxLen < len(tokens):
            if train:
                maxLen = len(tokens)
            else:
                tokens = tokens[:maxLen]
        label_set.add(relation)
        sen['sbj'] = sbj
        sen['obj'] = obj
        sen['relation'] = relation
        sen['sentence'] = sentence
        sen['tokens'] = tokens
        mat.append(sen)
        line_num += 1
    train_f.close()
    return mat, label_set, maxLen, line_num
Example #15
def parse_sstb(dir, binary=False):
    '''
    Hardcoded solution to process SSTB due to its unique formatting
    :param dir: SSTB directory
    :return: dictionary containing phrase/label combinations
    '''
    dict = open(dir + 'dictionary.txt', 'r').read()
    labels = open(dir + 'sentiment_labels.txt', 'r').read()

    tuples = [sub.split('|') for sub in dict.split('\n')]
    phrase_id = {clean_str(x[0]): int(x[1]) for x in tuples if len(x) > 1}
    tuples2 = [sub.split('|') for sub in labels.split('\n')][1:]
    id_label = {int(x[0]): float(x[1]) for x in tuples2 if len(x) > 1}

    phrase_label = {
        x: transform_label(id_label[phrase_id[x]])
        for x in phrase_id.keys()
    }

    # Binarize
    if binary:
        phrase_label = {
            x: binarize(y)
            for (x, y) in phrase_label.items() if y != 2
        }

    return phrase_label
    def user_request(self, tokenizer, text, opts, faq_size):
        '''
        user_request retrieves the user query.
        Modify as needed - make sure each user question is repeated n times
        (where n = len(faqs)); the idea is to compare the user request
        to every single FAQ.
        @args
            tokenizer: saved tokenizer object
            text: the question asked by the user
            opts: configuration params
            faq_size: length of the loaded FAQ list
        @returns
            user_input_list_data: list with the repeated user input
            user_input: the user's question, to be matched against known FAQs
        '''

        user_input = text

        # note: make a list of repeated user input same size as faq
        user_input_list = [user_input for i in range(faq_size)]
        user_input_list = [
            utils.clean_str(sentence) for sentence in user_input_list
        ]
        user_input_list_seq = tokenizer.texts_to_sequences(user_input_list)
        user_input_list_data = pad_sequences(user_input_list_seq,
                                             opts.max_sequence_len)
        return user_input_list_data, user_input
Example #17
def write_i_c_data_pipeline(docs, context_fn, fdir):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            for ev in doc.labels['BERT_ev']:
                visited_frames = {}
                for i in doc.labels['NER_i']:
                    k = i.text
                    if k not in visited_frames:
                        context_text = utils.clean_str(context_fn(doc, i, ev))
                        fout.write('0\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format( \
                          doc.id, ev.i, ev.f, i.i, i.f, utils.clean_str(i.text), context_text))
                        visited_frames[k] = True
Example #18
def autoPlay(url, name, autoPlay_type):
    import random
    from domains import sitesBase, parse_reddit_link, build_DirectoryItem_url_based_on_media_type
    from utils import unescape, post_is_filtered_out, log, clean_str
    from actions import setting_gif_repeat_count
    from reddit import reddit_request, determine_if_video_media_from_reddit_json


    gif_repeat_count=setting_gif_repeat_count()

    entries = []
    playlist = xbmc.PlayList(xbmc.PLAYLIST_VIDEO)
    playlist.clear()
    log("**********autoPlay %s*************" %autoPlay_type)
    content = reddit_request(url)
    if not content: return

    content = json.loads(content.replace('\\"', '\''))

    log("Autoplay %s - Parsing %d items" %( autoPlay_type, len(content['data']['children']) )    )

    for j_entry in content['data']['children']:
        try:
            if post_is_filtered_out( j_entry ):
                continue

            title = clean_str(j_entry, ['data','title'])

            try:
                media_url = j_entry['data']['url']
            except:
                media_url = j_entry['data']['media']['oembed']['url']

            is_a_video = determine_if_video_media_from_reddit_json(j_entry)

            ld=parse_reddit_link(link_url=media_url, assume_is_video=is_a_video, needs_preview=False, get_playable_url=True )

            DirectoryItem_url, setProperty_IsPlayable, isFolder, title_prefix = build_DirectoryItem_url_based_on_media_type(ld, media_url, title, on_autoplay=True)

            if ld:
                if ld.media_type not in [sitesBase.TYPE_VIDEO, sitesBase.TYPE_GIF, sitesBase.TYPE_VIDS, sitesBase.TYPE_MIXED]:
                    continue

            autoPlay_type_entries_append( entries, autoPlay_type, title, DirectoryItem_url)
            if ld.media_type == sitesBase.TYPE_GIF:
                for _ in range( 0, gif_repeat_count ):
                    autoPlay_type_entries_append( entries, autoPlay_type, title, DirectoryItem_url)

        except Exception as e:
            log("  EXCEPTION Autoplay "+ str( sys.exc_info()[0]) + "  " + str(e) )


    if autoplayRandomize:
        random.shuffle(entries)

    for title, url in entries:
        listitem = xbmcgui.ListItem(title)
        playlist.add(url, listitem)
        log('add to playlist: %s %s' %(title.ljust(25)[:25],url ))
    xbmc.Player().play(playlist)
Example #19
def parse_comments(sents: Sequence[str], table_name):
    timestamps = [None]
    texts = []

    text = []
    for sent in sents:
        sent = clean_str(sent)
        date_strings = DATE_REGEX[table_name].findall(sent)
        if len(date_strings) == 0:
            text.append(sent)
        else:
            for date_string in date_strings:
                # print(date_string)
                split = sent.split(date_string)
                text.append(split[0])
                if len(split) > 1:
                    texts.append(' '.join(text))
                    text = []
                    sent = split[1]
                timestamps.append(
                    datetime.strptime(date_string, DATE_STR[table_name]))
            text.append(sent)
    texts.append(' '.join(text))

    if len(timestamps) > 1:
        # approximate the first text to be written around the time of the first comment.
        timestamps[0] = min(timestamps[1:])

    return timestamps, texts
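parse_comments depends on module-level DATE_REGEX and DATE_STR lookups (and a string-level clean_str) that are not shown in this example. A minimal illustration of what those lookups might contain, using a hypothetical 'notes' table whose comments embed dates such as 01/31/2020:

import re

# Hypothetical lookups assumed by parse_comments: one compiled date regex and
# one matching strptime format per table name.
DATE_REGEX = {'notes': re.compile(r'\d{2}/\d{2}/\d{4}')}
DATE_STR = {'notes': '%m/%d/%Y'}

# Example call (with parse_comments and clean_str in scope):
#   timestamps, texts = parse_comments(
#       ['initial note', 'follow-up 01/31/2020 patient improving'], 'notes')
# timestamps[0] is then back-filled with the earliest parsed comment date.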
Example #20
def cls_tf_idf(batch_lines):
    batch_x = [clean_str(sent) for sent in batch_lines]
    pred, attn = cls(batch_x)
    pred = np.argmax(pred.cpu().data.numpy(), axis=1)
    ret = []
    for line, x, pre, att in zip(batch_lines, batch_x, pred, attn):
        if len(x) > 0:
            att = att[:len(x)]
            if sys.argv[7] == 'yelp':
                avg = torch.mean(att)
            elif sys.argv[7] == 'amazon':
                avg = 0.4
            mask = att.gt(avg)
            if sum(mask).item() == 0:
                mask = torch.argmax(att).unsqueeze(0)
            else:
                mask = torch.nonzero(mask.squeeze()).squeeze(1)
            idx = mask.cpu().numpy()
            idx = [int(ix) for ix in idx]
            contents = []
            for i in range(0, len(x)):
                if i not in idx:
                    contents.append(x[i])
            wl = {
                "content": ' '.join(contents),
                "line": line.strip(),
                "masks": list(idx),
                "label": sys.argv[1][-1]
            }
            #print(wl)
            wl_str = json.dumps(wl)
            ret.append(wl_str)
    return ret
Example #21
def load_dict(path):
    dict = Dictionary()
    concepts =[]
    for line in codecs.open(path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue

        concept = Concept()

        linesplit = line.split("\t")
        concept.meshId = linesplit[0].strip()

        name_synonym = list()
        for idx in range(1, len(linesplit)):
            names = linesplit[idx]

            names = clean_str(names)
            if opt.use_word2digit:
                names = wordToDigit(names)
            if names == '':
                continue
            name_synonym.append(names)
        concept.set_names(name_synonym)
        concepts.append(concept)

    dict.set_concepts(concepts)
    dict.set_id_to_names()
    return dict
Example #22
def write_o_ev_data(docs, fdir, add_i=False):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            for frame in doc.frames:
                sents = utils.s_overlaps(frame.ev, doc.sents)
                ev_text = utils.clean_str(doc.text[sents[0].i:sents[-1].f])
                o_text = utils.clean_str(frame.o.text)
                if add_i:
                    o_text = '{} effect on {}'.format(
                        utils.clean_str(frame.i.text), o_text)
                fout.write('{}\t{}\t{}\t{}\n'.format(frame.label + 1, doc.id,
                                                     o_text, ev_text))
Example #23
def prod_num_of_sold_date():
    # obtain the parameters
    ## parameters with default values
    para = def_para()
    if (para is None): return help_prod_num_of_sold_date()
    prod_id = None if (
        request.args.get("prod_id") is None) else request.args.get("prod_id")
    time_range = None if (
        request.args.get("range") is None) else request.args.get("range")
    ## user given parameters
    start = utils.clean_str(str(request.args.get("start_date")), strip=True)
    end = utils.clean_str(str(request.args.get("end_date")), strip=True)
    # convert to datetime object
    try:
        start = datetime.strptime(start, "%Y-%m-%d")
        end = datetime.strptime(end, "%Y-%m-%d")
    except:
        return help_prod_num_of_sold_date(err="Invalid date!")
    # end date is before start date
    if (end < start):
        return help_prod_num_of_sold_date(
            err="End date is berfore start date!")
    start = start.strftime("%Y-%m-%d")
    end = end.strftime("%Y-%m-%d")
    if (start == end):
        start += " 00:00:00"
        end += " 23:59:59"

    # determine the grouping time range
    switcher = {
        "day": "DAY(timestamp)",  # group by day
        "week": "WEEK(timestamp)",  # group by week
        "month": "MONTH(timestamp)"  # group by month
    }
    if (time_range is not None):
        time_range = utils.clean_str(time_range.lower(), strip=True)
    time_filter = switcher.get(time_range, "DATE(timestamp)")
    # append the timestamp filter
    sql = sql_command.sql_num_of_sold_per_prod_by_date.format(
        f' timestamp BETWEEN "{start}" AND "{end}" ')
    # if a specific product id is given......
    if (prod_id is not None):
        prod_id = utils.clean_str(prod_id, strip=True)
        sql += f' WHERE Product.prod_id = "{prod_id}"'
    sql += f" GROUP BY Product.prod_id, {time_filter} ORDER BY {time_filter}"
    return gen_response(sql, para, helper_funct=help_prod_num_of_sold_date)
Example #24
def write_sent_data_pipeline(docs, fdir):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            for sent in doc.sents:
                fout.write('0\t{}\t{}\n'.format(doc.id,
                                                utils.clean_str(sent.text)))
def build_sst_matrices(lines):
    X, y = [], []
    for line in lines:
        words = line.split(' ')
        label = [0, 0]
        label[int(line[0])] = 1
        sent = clean_str(line[1:])
        X.append(sent)
        y.append(label)
    return (X, y)
def load_or_build_embedding(ds, vocab):
    # One-hot embedding
    # embd = eye(len(vocab))
    # return embd

    # Read Word Vectors
    # word_vector_file = 'data/glove.6B/glove.6B.300d.txt'
    # word_vector_file = 'data/corpus/' + dataset + '_word_vectors.txt'
    #_, embd, word_vector_map = loadWord2Vec(word_vector_file)
    # word_embeddings_dim = len(embd[0])
    try:
        word_vector_file = 'data/corpus/' + ds + '_word_vectors.txt'
        word_vec_vocab, embd, word_vec_id_map = loadWord2Vec(word_vector_file)
        word_embeddings_dim = len(embd[0])

        # word embedding matrix
        wm = np.matrix(embd)
        return word_vec_vocab, wm, word_vec_id_map
    except:
        print('Building embedding...')
        definitions = []
        for word in vocab:
            word = word.strip()
            synsets = wn.synsets(clean_str(word))
            word_defs = []
            for synset in synsets:
                syn_def = synset.definition()
                word_defs.append(syn_def)
            word_des = ' '.join(word_defs)
            if word_des == '':
                word_des = '<PAD>'
            definitions.append(word_des)

        tfidf_vec = TfidfVectorizer(max_features=1000)
        tfidf_matrix = tfidf_vec.fit_transform(definitions)
        tfidf_matrix_array = tfidf_matrix.toarray()

        word_vectors = []

        for i in range(len(vocab)):
            word = vocab[i]
            vector = tfidf_matrix_array[i]
            str_vector = []
            for j in range(len(vector)):
                str_vector.append(str(vector[j]))
            temp = ' '.join(str_vector)
            word_vector = word + ' ' + temp
            word_vectors.append(word_vector)

        string = '\n'.join(word_vectors)
        f = open('data/corpus/' + ds + '_word_vectors.txt', 'w')
        f.write(string)
        f.close()

        return load_or_build_embedding(ds, vocab)
Example #27
def write_i_c_data(docs, context_fn, fdir, neg_prob=0.5):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            visited_frames = {}
            for f in doc.frames:
                k = (f.i.text, f.c.text, f.ev.text)
                if k not in visited_frames:
                    context_text = utils.clean_str(context_fn(doc, f.i, f.ev))
                    fout.write('2\t{}\t{}\t{}\n'.format(
                        doc.id, utils.clean_str(f.i.text), context_text))
                    fout.write('1\t{}\t{}\t{}\n'.format(
                        doc.id, utils.clean_str(f.c.text), context_text))
                    visited_frames[k] = True
                    if random.random() <= neg_prob:
                        neg_i = get_neg_i(doc, f)
                        fout.write('0\t{}\t{}\t{}\n'.format(
                            doc.id, utils.clean_str(neg_i.text), context_text))
Example #28
def prod_show_by_id():
    # obtain the parameters
    ## parameters with default values
    para = def_para()
    if (para is None): return help_prod_show_by_id()
    ## user given parameters
    prod_id = request.args.get("prod_id")
    if (prod_id is None): return help_prod_show_by_id("No Product ID given!")
    prod_id = utils.clean_str(prod_id, strip=True)

    sql = sql_command.sql_product_by_id.format(prod_id)
    return gen_response(sql, para, helper_funct=help_prod_show_by_id)
Example #29
def parserNcbiTxtFile_simple(path):
    logging.info("loadData: {}".format(path))
    if opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')

    documents = []
    id=title=abstractt = ""
    document = Document()
    for line in codecs.open(path, 'r', 'utf-8'):

        line = line.strip()

        if line != "":
            linesplits = line.split("|")
            if len(linesplits) == 3:
                if linesplits[1] == "t":
                    id = linesplits[0]
                    title = linesplits[2]
                if linesplits[1] == "a":
                    abstractt = linesplits[2]
            linesplitsEntity = line.split("\t")
            if len(linesplitsEntity) == 6:
                if linesplitsEntity[4] == 'Chemical':
                    continue
                meshId = linesplitsEntity[len(linesplitsEntity)-1]
                index = meshId.find(":")
                if index != -1:
                    meshId = meshId[index+1:]
                meshId = meshId.strip()
                entity = Entity()
                entitytext = clean_str(linesplitsEntity[3])
                if opt.use_word2digit:
                    entitytext = wordToDigit(entitytext)


                entity.setEntity(linesplitsEntity[0],int(linesplitsEntity[1]), int(linesplitsEntity[2]), entitytext,'Disease',meshId.strip())

                document.entities.append(entity)
        else:
            if len(id)>0 and len(title)>0 and len(abstractt)>0:
                document.initDocument(id, title, abstractt)
                # if id == '2234245':
                #     print(id)
                document_text = title + " " + abstractt
                sentences = get_sentences_and_tokens_from_nltk(document_text.lower(), nlp_tool, document.entities)
                document.sents = sentences
                document.initDocument(id, title, abstractt)

                documents.append(document)
                id = title = abstractt = ""
                document = Document()

    return documents
def create_mpqa():
    mpqa = read_file('raw_datasets/mpqa.all')

    # build matrices
    X, y = [], []
    for line in mpqa:
        words = line.split(' ')
        label = [0,0]
        label[int(line[0])] = 1
        sent = clean_str(line[1:])
        
        X.append(sent)
        y.append(label)

    # build vocab
    mpqa_vocab = Vocabulary(X)
    print('vocab', len(mpqa_vocab.vocab))

    # encode sents
    max_len = compute_avg_len(X) 
    for i in range(len(X)):
        X[i] = encode_sent(X[i].split(' '), mpqa_vocab.encoding, max_len)
    
    # build embeddings
    embeddings = []
    for name, (emb_vocab, emb_vectors) in embeddings_map.items():
        embedding, found = create_embeddings(
            mpqa_vocab, emb_vocab, emb_vectors, 300
        )
        embeddings.append(embedding)
        print('{} - {}'.format(name, found))
    w2v_embeddings, glove_embeddings, nb_embeddings = embeddings

    # shuffle
    X, y = np.array(X), np.array(y)
    indices = np.random.permutation(len(X))
    X, y = X[indices], y[indices]

    split_idx = int(len(X) * 0.9)
    X_train, X_valid = X[:split_idx], X[split_idx:]
    y_train, y_valid = y[:split_idx], y[split_idx:]

    print('train', X_train.shape, y_train.shape)
    print('valid', X_valid.shape, y_valid.shape)

    # save objects
    save_object('datasets/mpqa_train', (X_train, y_train))
    save_object('datasets/mpqa_valid', (X_valid, y_valid))
    save_object('datasets/mpqa_vocab', mpqa_vocab)
    save_object('datasets/mpqa_w2v_embs', w2v_embeddings)
    save_object('datasets/mpqa_glove_embs', glove_embeddings)
    save_object('datasets/mpqa_nb_embs', nb_embeddings)
Example #31
def cate_purchased_by_cust():
    # obtain the parameters
    ## parameters with default values
    para = def_para()
    if (para is None): return help_cate_purchased_by_cust()
    ## user given parameters
    cust_id = request.args.get('cust_id')
    if (cust_id is None):
        return help_cate_purchased_by_cust(err="Please enter the cust_id!")
    cust_id = utils.clean_str(str(cust_id), strip=True)

    sql = sql_command.sql_purchased_by_cust.format(cust_id)
    return gen_response(sql, para, helper_funct=help_cate_purchased_by_cust)
Example #32
def loadAbbreviations(abbrePath):
    abbreviations = list()
    lines = codecs.open(abbrePath, 'r', 'utf-8')
    for line in lines:
        line = line.strip().lower()
        if line=='':
            continue
        linesplits = line.split("\t")
        abbre = DiseaseAbbreviation()

        if len(linesplits) < 3:
            print(line)
        linesplits[1] = clean_str(linesplits[1])
        linesplits[2] = clean_str(linesplits[2])
        if opt.use_word2digit:
            linesplits[1] = wordToDigit(linesplits[1])
            linesplits[2] = wordToDigit(linesplits[2])

        abbre.initAbbre(linesplits[0].strip(), linesplits[1], linesplits[2])
        if abbre not in abbreviations:
            abbreviations.append(abbre)
    return abbreviations
Example #33
def cls_tf_idf(model, label, split):
    fr = open(
        "processed_data{}/{}/{}.{}.{}.unable.label".format(
            modified, task_name, task_name, split, label), 'r')
    lines = []
    for l in fr:
        lines.append(l)
    fr.close()

    fw = open(
        "processed_data{}/{}/{}.{}.{}.data.label".format(
            modified, task_name, task_name, split, label), 'a')
    line_num = min(len(lines), max_line)
    for i in range(0, line_num, batch_size):
        batch_range = min(batch_size, line_num - i)
        batch_lines = lines[i:i + batch_range]
        batch_x = [clean_str(sent) for sent in batch_lines]
        pred, attn = model(batch_x)
        pred = np.argmax(pred.cpu().data.numpy(), axis=1)
        for line, x, pre, att in zip(batch_lines, batch_x, pred, attn):
            if len(x) > 0:
                att = att[:len(x)]
                if task_name == 'yelp':
                    avg = torch.mean(att)
                elif task_name == 'amazon':
                    avg = 0.4
                else:
                    avg = torch.mean(att) * 0.5 + 0.4 * 0.5
                mask = att.gt(avg)
                if sum(mask).item() == 0:
                    mask = torch.argmax(att).unsqueeze(0)
                else:
                    mask = torch.nonzero(mask.squeeze()).squeeze(1)
                idx = mask.cpu().numpy()
                idx = [int(ix) for ix in idx]
                contents = []
                for i in range(0, len(x)):
                    if i not in idx:
                        contents.append(x[i])
                wl = {
                    "content": ' '.join(contents),
                    "line": line.strip(),
                    "masks": list(idx),
                    "label": str(label)
                }
                #print(wl)
                wl_str = json.dumps(wl)
                fw.write(wl_str)
                fw.write("\n")
    fw.close()
    print("processed over!")
def count_links_from_same_domain(entry):
    # returns a per-domain count that is used as a delay for threads that will query the same domain
    from utils import clean_str
    kind=entry.get('kind')  #t1 for comments  t3 for posts
    data=entry.get('data')

    if kind=='t3':
        domain=clean_str(data,['domain'])
        domains_d[domain] += 1
        #title=clean_str(data,['title'])
        #log( '{:<20.20}... {:>22.22}.. {0}'.format(title, domain,domains_d[domain]))
        return domain,domains_d[domain]  # returns how many times this domain has been counted in domains_d
    else:
        return '',0
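count_links_from_same_domain relies on a module-level domains_d counter, and the per-domain count it returns (like its comments counterpart used in listLinksInComment below) feeds compute_anti_dos_delay. Neither is shown on this page, so the following is an illustrative sketch only; the real delay formula may differ.

from collections import defaultdict

# Assumed module-level counter: how many queued posts point at each domain.
domains_d = defaultdict(int)

def compute_anti_dos_delay(domain, domain_count, step_ms=500):
    # Illustrative only: stagger threads that hit the same domain by a fixed
    # step (in milliseconds) per request already queued for that domain.
    if not domain:
        return 0
    return (domain_count - 1) * step_ms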
Example #35
def reddit_post_worker(idx, entry, q_out):
    import datetime
    from utils import strip_emoji, pretty_datediff, clean_str
    from reddit import determine_if_video_media_from_reddit_json, ret_sub_icon

    show_listVideos_debug=True
    credate = ""
    is_a_video=False
    title_line2=""
    t_on = translation(30071)  #"on"

    t_pts='c'
    thumb_w=0; thumb_h=0

    try:

        kind=entry.get('kind')  #t1 for comments  t3 for posts
        data=entry.get('data')
        post_id=data.get('name')
        if data:
            if kind=='t3':
                title = clean_str(data,['title'])
                description=clean_str(data,['media','oembed','description'])
                post_selftext=clean_str(data,['selftext'])

                description=post_selftext+'[CR]'+description if post_selftext else description
            else:
                title=clean_str(data,['link_title'])
                description=clean_str(data,['body'])

            title = strip_emoji(title) #an emoji in the title was causing a KeyError  u'\ud83c'

            commentsUrl = urlMain+clean_str(data,['permalink'])


            try:
                aaa = data.get('created_utc')
                credate = datetime.datetime.utcfromtimestamp( aaa )
                now_utc = datetime.datetime.utcnow()
                pretty_date=pretty_datediff(now_utc, credate)
                credate = str(credate)
            except (AttributeError,TypeError,ValueError):
                credate = ""

            subreddit=clean_str(data,['subreddit'])
            author=clean_str(data,['author'])
            domain=clean_str(data,['domain'])

            num_comments = data.get('num_comments',0)

            d_url=clean_str(data,['url'])
            link_url=clean_str(data,['link_url'])
            media_oembed_url=clean_str(data,['media','oembed','url'])

            media_url=next((item for item in [d_url,link_url,media_oembed_url] if item ), '')


            thumb=clean_str(data,['thumbnail'])


            if not thumb.startswith('http'): #in ['nsfw','default','self']:  #reddit has a "default" thumbnail (alien holding camera with "?")
                thumb=""

            if thumb=="":
                thumb=clean_str(data,['media','oembed','thumbnail_url']).replace('&amp;','&')

            if thumb=="":  #use this subreddit's icon if thumb still empty
                try: thumb=ret_sub_icon(subreddit)
                except: pass

            try:

                preview=data.get('preview')['images'][0]['source']['url'].encode('utf-8').replace('&amp;','&')

                try:
                    thumb_h = float( data.get('preview')['images'][0]['source']['height'] )
                    thumb_w = float( data.get('preview')['images'][0]['source']['width'] )
                except (AttributeError,TypeError,ValueError):
                    thumb_w=0; thumb_h=0

            except Exception as e:

                thumb_w=0; thumb_h=0; preview="" #a blank preview image will be replaced with poster_url from make_addon_url_from() for domains that support it

            is_a_video = determine_if_video_media_from_reddit_json(data)

            over_18=data.get('over_18')

            title_line2=""


            title_line2 = "[I][COLOR dimgrey]%s %s [COLOR cadetblue]r/%s[/COLOR] (%d) %s[/COLOR][/I]" %(pretty_date,t_on, subreddit,num_comments, t_pts)

            if show_listVideos_debug : log("  POST%cTITLE%.2d=%s" %( ("v" if is_a_video else " "), idx, title ))


            tuple_for_addDirectoryItems=addLink(title=title,
                    title_line2=title_line2,
                    iconimage=thumb,
                    previewimage=preview,
                    preview_w=thumb_w,
                    preview_h=thumb_h,
                    domain=domain,
                    description=description,
                    credate=credate,
                    reddit_says_is_video=is_a_video,
                    commentsUrl=commentsUrl,
                    subreddit=subreddit,
                    media_url=media_url,
                    over_18=over_18,
                    posted_by=author,
                    num_comments=num_comments,
                    post_index=idx,
                    post_id=post_id
                    )

            q_out.put( [idx, tuple_for_addDirectoryItems] )
    except Exception as e:
        log( '  #reddit_post_worker EXCEPTION:' + repr(sys.exc_info()) +'--'+ str(e) )
def autoPlay(url, name, autoPlay_type):
    import random
    from domains import sitesBase, parse_reddit_link, build_DirectoryItem_url_based_on_media_type
    from utils import unescape, post_is_filtered_out, log, clean_str
    from actions import setting_gif_repeat_count
    from reddit import reddit_request, determine_if_video_media_from_reddit_json
    #collect a list of titles and urls as entries[] from the j_entries obtained from reddit
    #then create a playlist from those entries
    #then play the playlist

    gif_repeat_count=setting_gif_repeat_count()

    entries = []
    playlist = xbmc.PlayList(xbmc.PLAYLIST_VIDEO)
    playlist.clear()
    log("**********autoPlay %s*************" %autoPlay_type)
    content = reddit_request(url)
    if not content: return

    content = json.loads(content.replace('\\"', '\''))

    log("Autoplay %s - Parsing %d items" %( autoPlay_type, len(content['data']['children']) )    )

    for j_entry in content['data']['children']:
        try:
            if post_is_filtered_out( j_entry ):
                continue

            title = clean_str(j_entry, ['data','title'])

            try:
                media_url = j_entry['data']['url']
            except:
                media_url = j_entry['data']['media']['oembed']['url']

            is_a_video = determine_if_video_media_from_reddit_json(j_entry)

            #log("  Title:%s -%c %s"  %( title, ("v" if is_a_video else " "), media_url ) )
            #hoster, DirectoryItem_url, videoID, mode_type, thumb_url,poster_url, isFolder,setInfo_type, IsPlayable=make_addon_url_from(media_url,is_a_video)
            ld=parse_reddit_link(link_url=media_url, assume_is_video=is_a_video, needs_preview=False, get_playable_url=True )

            DirectoryItem_url, setProperty_IsPlayable, isFolder, title_prefix = build_DirectoryItem_url_based_on_media_type(ld, media_url, title, on_autoplay=True)

            if ld:
                if ld.media_type not in [sitesBase.TYPE_VIDEO, sitesBase.TYPE_GIF, sitesBase.TYPE_VIDS, sitesBase.TYPE_MIXED]:
                    continue

            autoPlay_type_entries_append( entries, autoPlay_type, title, DirectoryItem_url)
            if ld.media_type == sitesBase.TYPE_GIF:
                for _ in range( 0, gif_repeat_count ):
                    autoPlay_type_entries_append( entries, autoPlay_type, title, DirectoryItem_url)

        except Exception as e:
            log("  EXCEPTION Autoplay "+ str( sys.exc_info()[0]) + "  " + str(e) )

    #def k2(x): return x[1]
    #entries=remove_duplicates(entries, k2)

    if autoplayRandomize:
        random.shuffle(entries)

    #for title, url in entries:
    #    log("  added to playlist:"+ title + "  " + urllib.unquote_plus(url) )
    for title, url in entries:
        listitem = xbmcgui.ListItem(title)
        playlist.add(url, listitem)
        log('add to playlist: %s %s' %(title.ljust(25)[:25],url ))
    xbmc.Player().play(playlist)
def reddit_post_worker(idx, entry, q_out, delay=0):
    import datetime
    from utils import pretty_datediff, clean_str, get_int, format_description
    from reddit import determine_if_video_media_from_reddit_json
    from domains import sitesBase

    if delay>0:
        xbmc.Monitor().waitForAbort( float(delay)/1000 )         #xbmc.sleep(delay)
    try:
        credate = ""
        is_a_video=False
        title_line2=""
        thumb_w=0; thumb_h=0

        t_on = translation(32071)  #"on"
        #t_pts = u"\U0001F4AC"  # translation(30072) #"cmnts"  comment bubble symbol. doesn't work
        t_pts = u"\U00002709"  # translation(30072)   envelope symbol
        t_up = u"\U000025B4"  #u"\U00009650"(up arrow)   #upvote symbol

        #on 3/21/2017 we're adding a new feature that lets users view their saved posts by entering /user/username/saved as their subreddit.
        #  in addition to saved posts, users can also save comments. we need to handle it by checking for "kind"
        kind=entry.get('kind')  #t1 for comments  t3 for posts
        data=entry.get('data')
        if data:
            if kind=='t3':
                title = clean_str(data,['title'])
                description=clean_str(data,['media','oembed','description'])
                post_selftext=clean_str(data,['selftext'])

                description=post_selftext+'\n'+description if post_selftext else description
                domain=clean_str(data,['domain'])
            else:
                title=clean_str(data,['link_title'])
                description=clean_str(data,['body'])
                domain='Comment post'

            description=format_description(description, hide_text_in_parens=False)
            first_link_in_description=None

            #title=strip_emoji(title) #an emoji in the title was causing a KeyError  u'\ud83c'
            title=format_description(title)

            is_a_video = determine_if_video_media_from_reddit_json(entry)
            log("  POS%s%cTITLE%.2d=%s d=%d" %( kind, ("v" if is_a_video else " "), idx, title,delay ))
            #log("description%.2d=%s" %(idx,description))
            post_id = entry['kind'] + '_' + data.get('id')  #same as entry['data']['name']
            #log('  %s  %s ' % (post_id, entry['data']['name'] ))
            commentsUrl = urlMain+clean_str(data,['permalink'])
            #log("commentsUrl"+str(idx)+"="+commentsUrl)
            try:
                aaa = data.get('created_utc')
                credate = datetime.datetime.utcfromtimestamp( aaa )
                now_utc = datetime.datetime.utcnow()
                pretty_date=pretty_datediff(now_utc, credate)
                credate = str(credate)
            except (AttributeError,TypeError,ValueError):
                credate = ""

            subreddit=clean_str(data,['subreddit'])
            author=clean_str(data,['author'])
            #log("     DOMAIN%.2d=%s" %(idx,domain))

            #post_excluded_from() is a misnomer. it just returns true if subreddit is in csv-list
            if (post_excluded_from( use_first_link_in_textpost_for_the_following_subreddits, subreddit) or
                post_excluded_from( use_first_link_in_textpost_for_the_following_subreddits, 'all')     and
                domain.startswith('self.')):
                first_link_in_description=sitesBase.get_first_url_from(description)
                #override the domain so that bottom right of gui matches link
                if first_link_in_description:
                    domain = '({uri.netloc})'.format( uri=urlparse.urlparse( first_link_in_description ) )

            ups = data.get('score',0)       #downs not used anymore
            num_comments = data.get('num_comments',0)

            d_url=clean_str(data,['url'])
            link_url=clean_str(data,['link_url'])
            media_oembed_url=clean_str(data,['media','oembed','url'])
#            log('   kind     ='+kind)
#            log('    url     ='+d_url)
#            log('    link_url='+link_url)
#            log('   permalink='+clean_str(data,['permalink']))
#            log('    media_oembed_url='+media_oembed_url)
            media_url=next((item for item in [first_link_in_description,d_url,link_url,media_oembed_url] if item ), '')
            #log("     MEDIA%.2d=%s" %(idx,media_url))

            thumb=clean_str(data,['thumbnail'])

            #media_w=get_int(data,['media','oembed','width'])
            #media_h=get_int(data,['media','oembed','height'])
            #log('  media_w='+repr(media_w)+' h='+repr(media_h) )

            #try:log('  media_w='+repr(data.get('media')['oembed']['width']  ) )
            #except:pass

            if not thumb.startswith('http'): #in ['nsfw','default','self']:  #reddit has a "default" thumbnail (alien holding camera with "?")
                thumb=""

            if thumb=="":
                thumb=clean_str(data,['media','oembed','thumbnail_url']).replace('&amp;','&')

            #a blank preview image will be replaced with poster_url from parse_reddit_link() for domains that support it
            preview=clean_str(data,['preview','images',0,'source','url']).replace('&amp;','&') #data.get('preview')['images'][0]['source']['url'].encode('utf-8').replace('&amp;','&')
            #log('  preview='+repr(preview))
            #try:
            thumb_h=get_int(data,['preview','images',0,'source','height'])#float( data.get('preview')['images'][0]['source']['height'] )
            thumb_w=get_int(data,['preview','images',0,'source','width']) #float( data.get('preview')['images'][0]['source']['width'] )
            #except (AttributeError,TypeError,ValueError):
                #log("   thumb_w _h EXCEPTION:="+ str( sys.exc_info()[0]) + "  " + str(e) )
            #   thumb_w=0; thumb_h=0

            #preview images are 'keep' stretched to fit inside 1080x1080.
            #  if preview image is smaller than the box we have for thumbnail, we'll use that as thumbnail and not have a bigger stretched image
            if thumb_w > 0 and thumb_w < 280:
                #log('*******preview is small ')
                thumb=preview
                thumb_w=0; thumb_h=0; preview=""

            over_18=data.get('over_18')

            title_line2=""
            title_line2 = "[I][COLOR dimgrey]%d%c %s %s [B][COLOR cadetblue]r/%s[/COLOR][/B] (%d) %s[/COLOR][/I]" %(ups,t_up,pretty_date,t_on, subreddit,num_comments, t_pts)

            liz=addLink(title=title,
                    title_line2=title_line2,
                    iconimage=thumb,
                    previewimage=preview,
                    preview_w=thumb_w,
                    preview_h=thumb_h,
                    domain=domain,
                    description=description,
                    credate=credate,
                    reddit_says_is_video=is_a_video,
                    commentsUrl=commentsUrl,
                    subreddit=subreddit,
                    link_url=media_url,
                    over_18=over_18,
                    posted_by=author,
                    num_comments=num_comments,
                    post_id=post_id,
                    )

            q_out.put( [idx, liz] )  #we put the idx back for easy sorting

    except Exception as e:
        log( '  #reddit_post_worker EXCEPTION:' + repr(sys.exc_info()) +'--'+ str(e) )
def listLinksInComment(url, name, type_):
    from guis import progressBG
    from reddit import reddit_request
    from utils import clean_str,remove_duplicates, is_filtered
    from default import comments_link_filter

    log('listLinksInComment:%s:%s' %(type_,url) )

    post_title=''
    global harvest
#    ShowOnlyCommentsWithlink=False
#    if type_=='linksOnly':
#        ShowOnlyCommentsWithlink=True

    #url='https://np.reddit.com/r/videos/comments/64j9x7/doctor_violently_dragged_from_overbooked_cia/dg2pbtj/?st=j1cbxsst&sh=2d5daf4b'
    #url=url.split('?', 1)[0]+'.json'+url.split('?', 1)[1]

    #log(repr(url.split('?', 1)[0]))
    #log(repr(url.split('?', 1)[1]))
    #log(repr(url.split('?', 1)[0]+'.json?'+url.split('?', 1)[1]))

    #url='https://www.reddit.com/r/Music/comments/4k02t1/bonnie_tyler_total_eclipse_of_the_heart_80s_pop/' + '.json'
    #only get up to "https://www.reddit.com/r/Music/comments/4k02t1".
    #   do not include                                            "/bonnie_tyler_total_eclipse_of_the_heart_80s_pop/"
    #   because we'll have problem when it looks like this: "https://www.reddit.com/r/Overwatch/comments/4nx91h/ever_get_that_feeling_déjà_vu/"
    #url=re.findall(r'(.*/comments/[A-Za-z0-9]+)',url)[0]
    #UPDATE you need to convert this: https://www.reddit.com/r/redditviewertesting/comments/4x8v1k/test_test_what_is_déjà_vu/
    #                        to this: https://www.reddit.com/r/redditviewertesting/comments/4x8v1k/test_test_what_is_d%C3%A9j%C3%A0_vu/
    #
    #use safe='' argument in quoteplus to encode only the weird chars part
    url=urllib.quote_plus(url,safe=':/?&')

    if '?' in url:
        url=url.split('?', 1)[0]+'.json?'+url.split('?', 1)[1]
    else:
        url+= '.json'

    xbmc_busy()

    loading_indicator=progressBG('Loading...')
    loading_indicator.update(0,'Retrieving comments')
    content = reddit_request(url)
    loading_indicator.update(10,'Parsing')

    if not content:
        loading_indicator.end()
        return

    li=[]  #make sure li exists for the GUI call below even if the parsing block raises before it is built
    try:
        xbmc_busy()
        content = json.loads(content)
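        #the comments endpoint returns two Listings: content[0] holds the submission itself, content[1] holds the comment tree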

        #harvest links in the post text (just 1)
        r_linkHunter(content[0]['data']['children'])

        #submitter=content[0]['data']['children'][0]['data']['author']
        submitter=clean_str(content,[0,'data','children',0,'data','author'])

        #the post title is provided in json, we'll just use that instead of messages from addLink()
        #post_title=content[0]['data']['children'][0]['data']['title']
        post_title=clean_str(content,[0,'data','children',0,'data','title'])

        #harvest links in the comments
        r_linkHunter(content[1]['data']['children'])
        #for i, h in enumerate(harvest):
        #    log( '  %d %s %.4d -%s   link[%s]' % ( i, h[7].ljust(8)[:8], h[0], h[3].ljust(20)[:20],h[2] ) )

        comments_count_orig=len(harvest)
        #log(' len harvest1 '+repr(len(harvest)))
        #remove duplicate links
        def k2(x): return (x[2],x[3])
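        #x[2] is the harvested link url and x[3] is assumed to be its link text/description (see r_linkHunter)
        #remove_duplicates (from utils) is assumed to behave roughly like this sketch, keeping the first occurrence per key:
        #   seen=set(); result=[]
        #   for item in items:
        #       k=key(item)
        #       if k not in seen: seen.add(k); result.append(item)
        #   return result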
        harvest=remove_duplicates(harvest,k2)
        comments_count_rd=len(harvest)
        #log(' len harvest2 '+repr(len(harvest)))

        loading_indicator.update(15,'Removed %d duplicates' %(comments_count_orig-comments_count_rd) )

        c_threads=[]
        q_liz=Queue()
        comments_count=len(harvest)
        filtered_posts=0
        for idx, h in enumerate(harvest):
            comment_score=h[0]
            link_url=h[2]
            if comment_score < int_CommentTreshold:
                log('    comment score %d < %d, skipped' %(comment_score,int_CommentTreshold) )
                filtered_posts+=1
                continue

            if is_filtered(comments_link_filter,link_url):
                log('    [{0}] is hidden by comments_link_filter'.format(link_url))
                filtered_posts+=1
                continue

            domain,domain_count=count_links_from_same_domain_comments(link_url) #count how many same domains we're hitting
            delay=compute_anti_dos_delay(domain,domain_count)
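            #compute_anti_dos_delay (not shown here) is assumed to stagger requests to the same host,
            #  e.g. roughly delay = (domain_count - 1) * some_per_domain_interval, so the worker threads don't all hit one domain at once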

            #have threads process each comment post
            t = threading.Thread(target=reddit_comment_worker, args=(idx, h,q_liz,submitter,delay), name='#t%.2d'%idx)
            c_threads.append(t)
            t.start()

        #loading_indicator.update(20,'Filtered %d comments' %(filtered_posts) )
        log(repr(domains_d))
        #check the queue to determine progress
        break_counter=0 #to avoid infinite loop
        expected_listitems=(comments_count-filtered_posts)
        if expected_listitems>0:
            loading_indicator.set_tick_total(expected_listitems)
            last_queue_size=0
            while q_liz.qsize() < expected_listitems:
                if break_counter>=100:
                    break
                #each change in the queue size gets a tick on our progress track
                if last_queue_size < q_liz.qsize():
                    items_added=q_liz.qsize()-last_queue_size
                    loading_indicator.tick(items_added,'Parsing')
                else:
                    break_counter+=1

                last_queue_size=q_liz.qsize()
                xbmc.sleep(50)

        #wait for all threads to finish before collecting the list items
        for idx, t in enumerate(c_threads):
            #log('    joining %s' %t.getName())
            t.join(timeout=20)

        xbmc_busy(False)

        #compare the number of entries to the returned results
        #log( "queue:%d entries:%d" %( q_liz.qsize() , len(content['data']['children'] ) ) )
        if q_liz.qsize() != expected_listitems:
            log('some threads did not return a listitem. total comments:%d expecting(%d) but only got(%d)' %(comments_count, expected_listitems, q_liz.qsize()))

        #for t in threads: log('isAlive %s %s' %(t.getName(), repr(t.isAlive()) )  )
        li=[ liz for idx,liz in sorted(q_liz.queue) ]
        #log(repr(li))

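        #clear any leftover items while holding the queue's own lock so a worker thread that outlived its join() timeout cannot add items mid-clear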
        with q_liz.mutex:
            q_liz.queue.clear()

    except Exception as e:
        log('  listLinksInComment EXCEPTION:' + str(e) )

    loading_indicator.end() #it is important to close xbmcgui.DialogProgressBG
# this portion is abandoned for now: the initial plan was to use auto-height textboxes in a grouplist to mimic the comment tree, but there is no obvious way to make the links inside them followable.
    from guis import comments_GUI2
    ui = comments_GUI2('view_464_comments_grouplist.xml' , addon_path, defaultSkin='Default', defaultRes='1080i', listing=li, id=55)
    #ui = comments_GUI2('aaa.xml' , addon_path, defaultSkin='Default', defaultRes='1080i', listing=li, id=55)
    ui.title_bar_text=post_title
    ui.doModal()
    del ui
    return
def reddit_post_worker(idx, entry, q_out):
    import datetime
    from utils import strip_emoji, pretty_datediff, clean_str
    from reddit import determine_if_video_media_from_reddit_json, ret_sub_icon

    show_listVideos_debug=True
    credate = ""
    is_a_video=False
    title_line2=""
    t_on = translation(30071)  #"on"
    #t_pts = u"\U0001F4AC"  # translation(30072) #"cmnts"  comment bubble symbol. doesn't work
    #t_pts = u"\U00002709"  # translation(30072)   envelope symbol
    t_pts='c'
    thumb_w=0; thumb_h=0

    try:
        #on 3/21/2017 we're adding a new feature that lets users view their saved posts by entering /user/username/saved as their subreddit.
        #  in addition to saved posts, users can also save comments. we need to handle it by checking for "kind"
        kind=entry.get('kind')  #t1 for comments  t3 for posts
        data=entry.get('data')
        if data:
            post_id=data.get('name')
            if kind=='t3':
                title = clean_str(data,['title'])
                description=clean_str(data,['media','oembed','description'])
                post_selftext=clean_str(data,['selftext'])

                description=post_selftext+'[CR]'+description if post_selftext else description
            else:
                title=clean_str(data,['link_title'])
                description=clean_str(data,['body'])

            title = strip_emoji(title) #an emoji in the title was causing a KeyError  u'\ud83c'

            commentsUrl = urlMain+clean_str(data,['permalink'])
            #if show_listVideos_debug :log("commentsUrl"+str(idx)+"="+commentsUrl)

            try:
                aaa = data.get('created_utc')
                credate = datetime.datetime.utcfromtimestamp( aaa )
                now_utc = datetime.datetime.utcnow()
                pretty_date=pretty_datediff(now_utc, credate)
                credate = str(credate)
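                #pretty_datediff (from utils) is assumed to return a short human-readable age such as "3h" or "2d" (illustrative)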
            except (AttributeError,TypeError,ValueError):
                credate = ""

            subreddit=clean_str(data,['subreddit'])
            author=clean_str(data,['author'])
            domain=clean_str(data,['domain'])
            #log("     DOMAIN%.2d=%s" %(idx,domain))

            #ups = data.get('score',0)       #downs not used anymore
            num_comments = data.get('num_comments',0)
            #description = "[COLOR blue]r/"+ subreddit + "[/COLOR]  [I]" + str(ups)+" pts  |  "+str(comments)+" cmnts  |  by "+author+"[/I]\n"+description
            #description = "[COLOR blue]r/"+ subreddit + "[/COLOR]  [I]" + str(ups)+" pts.  |  by "+author+"[/I]\n"+description
            #description = title_line2+"\n"+description
            #if show_listVideos_debug :log("DESCRIPTION"+str(idx)+"=["+description+"]")
            d_url=clean_str(data,['url'])
            link_url=clean_str(data,['link_url'])
            media_oembed_url=clean_str(data,['media','oembed','url'])

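            #pick the first non-empty url among the candidates, in order of preference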
            media_url=next((item for item in [d_url,link_url,media_oembed_url] if item ), '')
            #log("          url"+str(idx)+"="+media_url)

            thumb=clean_str(data,['thumbnail'])
            #if show_listSubReddit_debug : log("       THUMB%.2d=%s" %( idx, thumb ))

            if not thumb.startswith('http'):  #reddit uses placeholder values like 'nsfw','default','self' (the alien "?" icon) instead of a url; treat those as no thumbnail
                thumb=""

            if thumb=="":
                thumb=clean_str(data,['media','oembed','thumbnail_url']).replace('&amp;','&')

            if thumb=="":  #use this subreddit's icon if thumb still empty
                try: thumb=ret_sub_icon(subreddit)
                except Exception: pass  #keep the empty thumb if the subreddit icon lookup fails

            try:
                #collect_thumbs(entry)
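                #reddit's preview object is expected to look like: data['preview']['images'][0]['source'] = {'url':..., 'width':..., 'height':...}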
                preview=data.get('preview')['images'][0]['source']['url'].encode('utf-8').replace('&amp;','&')
                #poster = entry['data']['media']['oembed']['thumbnail_url'].encode('utf-8')
                #t=thumb.split('?')[0]
                #can't preview gif thumbnail on thumbnail view, use alternate provided by reddit
                #if t.endswith('.gif'):
                    #log('  thumb ends with .gif')
                #    thumb = entry['data']['thumbnail'].encode('utf-8')
                try:
                    thumb_h = float( data.get('preview')['images'][0]['source']['height'] )
                    thumb_w = float( data.get('preview')['images'][0]['source']['width'] )
                except (AttributeError,TypeError,ValueError):
                    thumb_w=0; thumb_h=0

            except Exception as e:
                #log("   getting preview image EXCEPTION:="+ str( sys.exc_info()[0]) + "  " + str(e) )
                thumb_w=0; thumb_h=0; preview="" #a blank preview image will be replaced with poster_url from make_addon_url_from() for domains that support it

            is_a_video = determine_if_video_media_from_reddit_json(data)

            over_18=data.get('over_18')

            #setting: toggle showing 2-line title
            #log("   TitleAddtlInfo "+str(idx)+"="+str(TitleAddtlInfo))
            title_line2=""
            #if TitleAddtlInfo:
            #title_line2 = "[I][COLOR dimgrey]%s by %s [COLOR darkslategrey]r/%s[/COLOR] %d pts.[/COLOR][/I]" %(pretty_date,author,subreddit,ups)
            #title_line2 = "[I][COLOR dimgrey]"+pretty_date+" by "+author+" [COLOR darkslategrey]r/"+subreddit+"[/COLOR] "+str(ups)+" pts.[/COLOR][/I]"

            title_line2 = "[I][COLOR dimgrey]%s %s [COLOR cadetblue]r/%s[/COLOR] (%d) %s[/COLOR][/I]" %(pretty_date,t_on, subreddit,num_comments, t_pts)
            #title_line2 = "[I]"+str(idx)+". [COLOR dimgrey]"+ media_url[0:50]  +"[/COLOR][/I] "  # +"    "+" [COLOR darkslategrey]r/"+subreddit+"[/COLOR] "+str(ups)+" pts.[/COLOR][/I]"

            #if show_listVideos_debug : log( ("v" if is_a_video else " ") +"     TITLE"+str(idx)+"="+title)
            if show_listVideos_debug : log("  POST%cTITLE%.2d=%s" %( ("v" if is_a_video else " "), idx, title ))
            #if show_listVideos_debug :log("      OVER_18"+str(idx)+"="+str(over_18))
            #if show_listVideos_debug :log("   IS_A_VIDEO"+str(idx)+"="+str(is_a_video))
            #if show_listVideos_debug :log("        THUMB"+str(idx)+"="+thumb)
            #if show_listVideos_debug :log("    MediaURL%.2d=%s" % (idx,media_url) )

            #if show_listVideos_debug :log("       HOSTER"+str(idx)+"="+hoster)
            #log("    VIDEOID"+str(idx)+"="+videoID)
            #log( "["+description+"]1["+ str(date)+"]2["+ str( count)+"]3["+ str( commentsUrl)+"]4["+ str( subreddit)+"]5["+ video_url +"]6["+ str( over_18))+"]"

            tuple_for_addDirectoryItems=addLink(title=title,
                    title_line2=title_line2,
                    iconimage=thumb,
                    previewimage=preview,
                    preview_w=thumb_w,
                    preview_h=thumb_h,
                    domain=domain,
                    description=description,
                    credate=credate,
                    reddit_says_is_video=is_a_video,
                    commentsUrl=commentsUrl,
                    subreddit=subreddit,
                    media_url=media_url,
                    over_18=over_18,
                    posted_by=author,
                    num_comments=num_comments,
                    post_index=idx,
                    post_id=post_id
                    )

            q_out.put( [idx, tuple_for_addDirectoryItems] )
    except Exception as e:
        log( '  #reddit_post_worker EXCEPTION:' + repr(sys.exc_info()) +'--'+ str(e) )
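#a minimal usage sketch (not from the source) of how reddit_post_worker output is typically collected:
#  one thread per post is started, then the [idx, listitem] pairs are drained from q_out and sorted by idx so reddit's ordering is kept.
#  collect_post_listitems and its 'entries' argument are hypothetical names; Queue and threading are the module-level imports already used above.
def collect_post_listitems(entries):
    q_out=Queue()
    workers=[]
    for idx, entry in enumerate(entries):
        t=threading.Thread(target=reddit_post_worker, args=(idx, entry, q_out), name='#t%.2d'%idx)
        workers.append(t)
        t.start()
    for t in workers:
        t.join(timeout=20)  #don't block forever on a stuck worker
    #each queue item is [idx, listitem]; sorting by idx restores the original post order
    return [liz for idx, liz in sorted(q_out.queue)]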