def determine_if_video_media_from_reddit_json(data):
    # reads the reddit json and determines if the link is a video
    is_a_video = False
    media_url = clean_str(data, ['media', 'oembed', 'url'], '')
    if media_url == '':
        media_url = clean_str(data, ['url'])
    # also check "post_hint" : "rich:video"
    media_url = media_url.split('?')[0]  # get rid of the query string
    try:
        media_type = data['media']['oembed']['type']
        if media_type is None:
            # usually entry['data']['media'] is null for non-videos,
            # but it is also null for gifv (especially nsfw)
            if ".gifv" in media_url.lower():  # special case for imgur
                is_a_video = True
            else:
                is_a_video = False
        elif media_type == 'video':
            is_a_video = True
        else:
            is_a_video = False
    except (KeyError, TypeError, AttributeError):
        is_a_video = False
    return is_a_video

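# NOTE: the Reddit-addon snippets in this listing call clean_str(data, ['media', 'oembed', 'url'], '')
# as a tolerant lookup into nested JSON. The helper itself is not included here; the following is only
# a minimal sketch of the assumed behaviour -- the name, signature, and default come from the call
# sites, everything else is an assumption.
def clean_str(obj, path, default=''):
    """Walk `path` (dict keys / list indices) into nested JSON; return a string, or `default` on any miss."""
    try:
        for key in path:
            obj = obj[key]
    except (KeyError, IndexError, TypeError):
        return default
    return default if obj is None else str(obj)
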
def write_sent_data(docs, fdir, balance_classes=False):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            sent_labels = [any([utils.s_overlap(s, f.ev) for f in doc.frames]) for s in doc.sents]
            pos_sents = [s.text for s, l in zip(doc.sents, sent_labels) if l]
            neg_sents = [s.text for s, l in zip(doc.sents, sent_labels) if not l]
            if balance_classes:
                neg_samples = []
                for pos_s in pos_sents:
                    # take the two negative sentences closest in length to each positive one
                    neg_sents = sorted(neg_sents, key=lambda s: abs(len(s) - len(pos_s)))
                    try:
                        neg_samples.append(neg_sents.pop(0))
                        neg_samples.append(neg_sents.pop(0))
                    except IndexError:
                        print('Warning: unable to sample enough negatives from doc {}'.format(doc.id))
                neg_sents = neg_samples
            for s in pos_sents:
                fout.write('1\t{}\t{}\n'.format(doc.id, utils.clean_str(s)))
            for s in neg_sents:
                fout.write('0\t{}\t{}\n'.format(doc.id, utils.clean_str(s)))

def r_linkHunter(json_node, d=0):
    from utils import clean_str
    # recursive function to harvest stuff from the reddit comments json reply
    prog = re.compile('<a href=[\'"]?([^\'" >]+)[\'"]>(.*?)</a>')
    for e in json_node:
        link_desc = ""
        link_http = ""
        author = ""
        created_utc = ""
        e_data = e.get('data')
        score = e_data.get('score', 0)
        if e['kind'] == 't1':  # 't1' for comments, 'more' for more comments (not supported)
            try:
                replies = e_data.get('replies')['data']['children']
            except (AttributeError, TypeError):
                replies = ""
            post_text = clean_str(e_data, ['body'])
            post_text = post_text.replace("\n\n", "\n")
            post_html = clean_str(e_data, ['body_html'])
            created_utc = e_data.get('created_utc', '')
            author = clean_str(e_data, ['author'])
            # searching for [link description](https://...) markdown in post_text misses posts that
            # do not follow that convention, so parse the rendered html instead
            result = prog.findall(post_html)
            if result:
                # store the post by itself and then a separate entry for each link
                harvest.append((score, link_desc, link_http, post_text, post_html, d, "t1", author, created_utc,))
                for link_http, link_desc in result:
                    harvest.append((score, link_desc, link_http, link_desc, post_html, d, "t1", author, created_utc,))
            else:
                harvest.append((score, link_desc, link_http, post_text, post_html, d, "t1", author, created_utc,))
            d += 1  # d tells us how deep in the comment tree we are
            r_linkHunter(replies, d)
            d -= 1
        if e['kind'] == 't3':  # 't3' for post text (a description of the post)
            self_text = clean_str(e_data, ['selftext'])
            self_text_html = clean_str(e_data, ['selftext_html'])
            result = prog.findall(self_text_html)
            if len(result) > 0:
                harvest.append((score, link_desc, link_http, self_text, self_text_html, d, "t3", author, created_utc,))
                for link_http, link_desc in result:
                    harvest.append((score, link_desc, link_http, link_desc, self_text_html, d, "t3", author, created_utc,))
            else:
                if len(self_text) > 0:  # don't post empty titles
                    harvest.append((score, link_desc, link_http, self_text, self_text_html, d, "t3", author, created_utc,))

def write_o_ev_data_pipeline(docs, fdir):
    fout = open('{}/{}.tsv'.format(fdir, docs[0].group), 'w')
    for doc in docs:
        assert doc.group == 'test' or doc.group == 'testtest'
        for ev_span in doc.labels['BERT_ev']:
            for o_span in utils.s_overlaps(ev_span, doc.labels['NER_o']):
                fout.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format('0', doc.id, o_span.i, o_span.f,
                        utils.clean_str(o_span.text), utils.clean_str(ev_span.text)))

def load_train_val_dataset(self, questions, duplicates, label):
    num_positive_samples = 145000
    num_negative_samples = 145000
    tr_idx = 250000
    val_idx = 290000
    # sample positive and negative labels
    positive_labels = []
    negative_labels = []
    for index, value in enumerate(label):
        if value == 1:
            positive_labels.append(index)
        else:
            negative_labels.append(index)
    positive_selection = random.sample(positive_labels, num_positive_samples)
    negative_selection = random.sample(negative_labels, num_negative_samples)
    selection = positive_selection + negative_selection
    random.shuffle(selection)
    # sample questions based on selected labels
    questions = [questions[i] for i in selection]
    duplicates = [duplicates[i] for i in selection]
    label = [label[i] for i in selection]
    questions = [utils.clean_str(sentence) for sentence in questions]
    questions_seq = self.tokenizer.texts_to_sequences(questions)
    questions_data = pad_sequences(questions_seq, self.opts.max_sequence_len)
    tr_input_one = questions_data[0:tr_idx]
    val_input_one = questions_data[tr_idx:val_idx]
    duplicates = [utils.clean_str(sentence) for sentence in duplicates]
    duplicates_seq = self.tokenizer.texts_to_sequences(duplicates)
    duplicates_data = pad_sequences(duplicates_seq, self.opts.max_sequence_len)
    tr_input_two = duplicates_data[0:tr_idx]
    val_input_two = duplicates_data[tr_idx:val_idx]
    target = label[0:tr_idx]
    target_val = label[tr_idx:val_idx]
    # convert labels to one-hot encoding
    tr_label = np_utils.to_categorical(target)
    val_label = np_utils.to_categorical(target_val)
    dataset = {
        'tr_input_one': tr_input_one,
        'tr_input_two': tr_input_two,
        'tr_label': tr_label,
        'val_input_one': val_input_one,
        'val_input_two': val_input_two,
        'val_label': val_label
    }
    return dataset

def get_clean_words(docs):
    clean_words = []
    for doc in docs:
        if args.dataset != "mr":
            temp = clean_str(doc).split()
            temp = list(filter(lambda x: x not in stop_words, temp))
        else:
            temp = clean_str(doc).split()
        clean_words.append(temp)
    return clean_words

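# NOTE: the NLP/classification snippets (get_clean_words, load_train_val_dataset, create_mr, ...) use a
# different clean_str(text) that normalizes a sentence before tokenization. That helper is also not part
# of this listing; the regex cleanup below is the variant commonly seen in TextCNN-style preprocessing
# and is shown only as an assumed sketch, not the exact implementation from these repositories.
import re

def clean_str(string):
    """Lowercase and space out punctuation/contractions so the text splits cleanly on whitespace."""
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
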
def process(eval_tuple, baseline=False):
    dbr = clean_str(eval_tuple['dbr'])
    question = clean_str(eval_tuple['question'])
    match_fn = simple_match if baseline else match_by_trigrams
    mention = match_fn(dbr, question)
    # mention = merge(dbr, match_by_trigrams(dbr, question), simple_match(dbr, question))
    if mention is None:
        mention = ''
    output = eval_tuple.copy()
    output['mention'] = mention
    return output

def create_mr():
    pos = read_file('raw_datasets/rt-polarity.pos')
    neg = read_file('raw_datasets/rt-polarity.neg')
    # build matrices
    X, y = [], []
    for sent in pos:
        X.append(clean_str(sent))
        y.append([0, 1])
    for sent in neg:
        X.append(clean_str(sent))
        y.append([1, 0])
    # build vocab
    mr_vocab = Vocabulary(X)
    print('vocab', len(mr_vocab.vocab))
    # encode sents
    max_seq_len = compute_avg_len(X)
    for i in range(len(X)):
        X[i] = encode_sent(X[i].split(' '), mr_vocab.encoding, max_seq_len)
    # build embeddings
    embeddings = []
    for name, (emb_vocab, emb_vectors) in embeddings_map.items():
        embedding, found = create_embeddings(mr_vocab, emb_vocab, emb_vectors, 300)
        embeddings.append(embedding)
        print('{} - {}'.format(name, found))
    w2v_embeddings, glove_embeddings, nb_embeddings = embeddings
    # shuffle
    X, y = np.array(X), np.array(y)
    indices = np.random.permutation(len(X))
    X, y = X[indices], y[indices]
    split_idx = int(len(X) * 0.9)
    X_train, X_valid = X[:split_idx], X[split_idx:]
    y_train, y_valid = y[:split_idx], y[split_idx:]
    print('train', X_train.shape, y_train.shape)
    print('valid', X_valid.shape, y_valid.shape)
    # save objects
    save_object('datasets/mr_train', (X_train, y_train))
    save_object('datasets/mr_valid', (X_valid, y_valid))
    save_object('datasets/mr_vocab', mr_vocab)
    save_object('datasets/mr_w2v_embs', w2v_embeddings)
    save_object('datasets/mr_glove_embs', glove_embeddings)
    save_object('datasets/mr_nb_embs', nb_embeddings)

def write_i_c_o_data(docs, fdir):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            for frame in doc.frames:
                i_text = utils.clean_str(frame.i.text)
                c_text = utils.clean_str(frame.c.text)
                o_text = utils.clean_str(frame.o.text)
                s1 = '{} vs. {}'.format(i_text, c_text)
                s2 = '{}'.format(o_text)
                fout.write('{}\t{}\t{}\t{}\n'.format(frame.label + 1, doc.id, s1, s2))

def main(args):
    device = flow.device("cpu") if args.no_cuda else flow.device("cuda")
    with open(args.config_path, "r") as f:
        config = json.load(f)
    with open(args.vocab_path, "rb") as f:
        vocab = pickle.load(f)
    textcnn = textCNN(
        word_emb_dim=config["word_emb_dim"],
        vocab_size=len(vocab),
        dim_channel=config["dim_channel"],
        kernel_wins=config["kernel_wins"],
        dropout_rate=config["dropout_rate"],
        num_class=config["num_class"],
        max_seq_len=config["max_seq_len"],
    )
    textcnn.load_state_dict(flow.load(args.model_path))
    textcnn.eval()
    textcnn.to(device)
    text = utils.clean_str(args.text)
    text = [utils.tokenizer(text)]
    input = flow.tensor(np.array(utils.tensorize_data(text, vocab, max_len=200)), dtype=flow.long).to(device)
    predictions = textcnn(input).softmax()
    predictions = predictions.numpy()
    clsidx = np.argmax(predictions)
    print("predict prob: %f, class name: %s" % (np.max(predictions), clsidx))

def fileToMat(filename, w2vec, maxLen, label_set, train=True):
    kkma = Kkma()
    train_f = open(filename, 'r', encoding='utf-8')
    mat = []
    line_num = 0
    for line in train_f.read().splitlines():
        sen = {}
        line_splitted = line.split('\t')
        sbj = line_splitted[0].strip()
        obj = line_splitted[1].strip()
        relation = line_splitted[2].strip()
        sentence = line_splitted[3].strip()
        sentence_complete = re.sub('<< _obj_ >>', obj, re.sub('<< _sbj_ >>', sbj, sentence))
        sentence_complete = utils.clean_str(sentence_complete)
        tokens = [p[0] + '/' + p[1] for p in kkma.pos(sentence_complete) if p[0] + '/' + p[1] in w2vec.vocab]
        if maxLen < len(tokens):
            if train:
                maxLen = len(tokens)
            else:
                tokens = tokens[:maxLen]
        label_set.add(relation)
        sen['sbj'] = sbj
        sen['obj'] = obj
        sen['relation'] = relation
        sen['sentence'] = sentence
        sen['tokens'] = tokens
        mat.append(sen)
        line_num += 1
    train_f.close()
    return mat, label_set, maxLen, line_num

def parse_sstb(dir, binary=False):
    '''
    Hardcoded solution to process SSTB due to its unique formatting
    :param dir: SSTB directory
    :return: dictionary containing phrase/label combinations
    '''
    dict = open(dir + 'dictionary.txt', 'r').read()
    labels = open(dir + 'sentiment_labels.txt', 'r').read()
    tuples = [sub.split('|') for sub in dict.split('\n')]
    phrase_id = {clean_str(x[0]): int(x[1]) for x in tuples if len(x) > 1}
    tuples2 = [sub.split('|') for sub in labels.split('\n')][1:]
    id_label = {int(x[0]): float(x[1]) for x in tuples2 if len(x) > 1}
    phrase_label = {x: transform_label(id_label[phrase_id[x]]) for x in phrase_id.keys()}
    # Binarize
    if binary:
        phrase_label = {x: binarize(y) for (x, y) in phrase_label.items() if y != 2}
    return phrase_label

def user_request(self, tokenizer, text, opts, faq_size):
    '''
    user_request retrieves the user query; modify as fit.
    Each user question is repeated n times (where n = len(faqs)); the idea is to
    compare the user request against every single faq.
    @args
        tokenizer: saved tokenizer object
        opts: configuration params
        faq_size: len of the loaded faq list
    @returns
        user_input_list_data: list with repeated user input
        user_input: question asked by user to be matched to known faqs
    '''
    user_input = text
    # note: make a list of repeated user input, same size as the faq list
    user_input_list = [user_input for i in range(faq_size)]
    user_input_list = [utils.clean_str(sentence) for sentence in user_input_list]
    user_input_list_seq = tokenizer.texts_to_sequences(user_input_list)
    user_input_list_data = pad_sequences(user_input_list_seq, opts.max_sequence_len)
    return user_input_list_data, user_input

def write_i_c_data_pipeline(docs, context_fn, fdir):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            for ev in doc.labels['BERT_ev']:
                visited_frames = {}
                for i in doc.labels['NER_i']:
                    k = i.text
                    if k not in visited_frames:
                        context_text = utils.clean_str(context_fn(doc, i, ev))
                        fout.write('0\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                            doc.id, ev.i, ev.f, i.i, i.f, utils.clean_str(i.text), context_text))
                        visited_frames[k] = True

def autoPlay(url, name, autoPlay_type):
    import random
    from domains import sitesBase, parse_reddit_link, build_DirectoryItem_url_based_on_media_type
    from utils import unescape, post_is_filtered_out, log, clean_str
    from actions import setting_gif_repeat_count
    from reddit import reddit_request, determine_if_video_media_from_reddit_json
    # collect a list of (title, url) entries from the json returned by reddit,
    # build a playlist from those entries, then play the playlist
    gif_repeat_count = setting_gif_repeat_count()
    entries = []
    playlist = xbmc.PlayList(xbmc.PLAYLIST_VIDEO)
    playlist.clear()
    log("**********autoPlay %s*************" % autoPlay_type)
    content = reddit_request(url)
    if not content:
        return
    content = json.loads(content.replace('\\"', '\''))
    log("Autoplay %s - Parsing %d items" % (autoPlay_type, len(content['data']['children'])))
    for j_entry in content['data']['children']:
        try:
            if post_is_filtered_out(j_entry):
                continue
            title = clean_str(j_entry, ['data', 'title'])
            try:
                media_url = j_entry['data']['url']
            except (KeyError, TypeError):
                media_url = j_entry['data']['media']['oembed']['url']
            is_a_video = determine_if_video_media_from_reddit_json(j_entry)
            ld = parse_reddit_link(link_url=media_url, assume_is_video=is_a_video, needs_preview=False, get_playable_url=True)
            DirectoryItem_url, setProperty_IsPlayable, isFolder, title_prefix = build_DirectoryItem_url_based_on_media_type(ld, media_url, title, on_autoplay=True)
            if ld:
                if ld.media_type not in [sitesBase.TYPE_VIDEO, sitesBase.TYPE_GIF, sitesBase.TYPE_VIDS, sitesBase.TYPE_MIXED]:
                    continue
            autoPlay_type_entries_append(entries, autoPlay_type, title, DirectoryItem_url)
            if ld.media_type == sitesBase.TYPE_GIF:
                for _ in range(0, gif_repeat_count):
                    autoPlay_type_entries_append(entries, autoPlay_type, title, DirectoryItem_url)
        except Exception as e:
            log("  EXCEPTION Autoplay " + str(sys.exc_info()[0]) + "  " + str(e))
    if autoplayRandomize:
        random.shuffle(entries)
    for title, url in entries:
        listitem = xbmcgui.ListItem(title)
        playlist.add(url, listitem)
        log('add to playlist: %s %s' % (title.ljust(25)[:25], url))
    xbmc.Player().play(playlist)

def parse_comments(sents: Sequence[str], table_name):
    timestamps = [None]
    texts = []
    text = []
    for sent in sents:
        sent = clean_str(sent)
        date_strings = DATE_REGEX[table_name].findall(sent)
        if len(date_strings) == 0:
            text.append(sent)
        else:
            for date_string in date_strings:
                # print(date_string)
                split = sent.split(date_string)
                text.append(split[0])
                if len(split) > 1:
                    texts.append(' '.join(text))
                    text = []
                    sent = split[1]
                timestamps.append(datetime.strptime(date_string, DATE_STR[table_name]))
            text.append(sent)
    texts.append(' '.join(text))
    if len(timestamps) > 1:
        # approximate the first text to be written around the time of the first comment.
        timestamps[0] = min(timestamps[1:])
    return timestamps, texts

def cls_tf_idf(batch_lines):
    batch_x = [clean_str(sent) for sent in batch_lines]
    pred, attn = cls(batch_x)
    pred = np.argmax(pred.cpu().data.numpy(), axis=1)
    ret = []
    for line, x, pre, att in zip(batch_lines, batch_x, pred, attn):
        if len(x) > 0:
            att = att[:len(x)]
            if sys.argv[7] == 'yelp':
                avg = torch.mean(att)
            elif sys.argv[7] == 'amazon':
                avg = 0.4
            mask = att.gt(avg)
            if sum(mask).item() == 0:
                mask = torch.argmax(att).unsqueeze(0)
            else:
                mask = torch.nonzero(mask.squeeze()).squeeze(1)
            idx = mask.cpu().numpy()
            idx = [int(ix) for ix in idx]
            contents = []
            for i in range(0, len(x)):
                if i not in idx:
                    contents.append(x[i])
            wl = {
                "content": ' '.join(contents),
                "line": line.strip(),
                "masks": list(idx),
                "label": sys.argv[1][-1]
            }
            # print(wl)
            wl_str = json.dumps(wl)
            ret.append(wl_str)
    return ret

def load_dict(path):
    dict = Dictionary()
    concepts = []
    for line in codecs.open(path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue
        concept = Concept()
        linesplit = line.split("\t")
        concept.meshId = linesplit[0].strip()
        name_synonym = list()
        for idx in range(1, len(linesplit)):
            names = linesplit[idx]
            names = clean_str(names)
            if opt.use_word2digit:
                names = wordToDigit(names)
            if names == '':
                continue
            name_synonym.append(names)
        concept.set_names(name_synonym)
        concepts.append(concept)
    dict.set_concepts(concepts)
    dict.set_id_to_names()
    return dict

def write_o_ev_data(docs, fdir, add_i=False):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            for frame in doc.frames:
                sents = utils.s_overlaps(frame.ev, doc.sents)
                ev_text = utils.clean_str(doc.text[sents[0].i:sents[-1].f])
                o_text = utils.clean_str(frame.o.text)
                if add_i:
                    o_text = '{} effect on {}'.format(utils.clean_str(frame.i.text), o_text)
                fout.write('{}\t{}\t{}\t{}\n'.format(frame.label + 1, doc.id, o_text, ev_text))

def prod_num_of_sold_date():
    # obtain the parameters
    ## parameters with default values
    para = def_para()
    if (para is None):
        return help_prod_num_of_sold_date()
    prod_id = None if (request.args.get("prod_id") is None) else request.args.get("prod_id")
    time_range = None if (request.args.get("range") is None) else request.args.get("range")
    ## user given parameters
    start = utils.clean_str(str(request.args.get("start_date")), strip=True)
    end = utils.clean_str(str(request.args.get("end_date")), strip=True)
    # convert to datetime object
    try:
        start = datetime.strptime(start, "%Y-%m-%d")
        end = datetime.strptime(end, "%Y-%m-%d")
    except:
        return help_prod_num_of_sold_date(err="Invalid date!")
    # end date is before start date
    if (end < start):
        return help_prod_num_of_sold_date(err="End date is before start date!")
    start = start.strftime("%Y-%m-%d")
    end = end.strftime("%Y-%m-%d")
    if (start == end):
        start += " 00:00:00"
        end += " 23:59:59"
    # determine the grouping time range
    switcher = {
        "day": "DAY(timestamp)",      # group by day
        "week": "WEEK(timestamp)",    # group by week
        "month": "MONTH(timestamp)"   # group by month
    }
    if (time_range is not None):
        time_range = utils.clean_str(time_range.lower(), strip=True)
    time_filter = switcher.get(time_range, "DATE(timestamp)")
    # append the timestamp filter
    sql = sql_command.sql_num_of_sold_per_prod_by_date.format(f' timestamp BETWEEN "{start}" AND "{end}" ')
    # if a specific product id is given......
    if (prod_id is not None):
        prod_id = utils.clean_str(prod_id, strip=True)
        sql += f' WHERE Product.prod_id = "{prod_id}"'
    sql += f" GROUP BY Product.prod_id, {time_filter} ORDER BY {time_filter}"
    return gen_response(sql, para, helper_funct=help_prod_num_of_sold_date)

def write_sent_data_pipeline(docs, fdir):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            for sent in doc.sents:
                fout.write('0\t{}\t{}\n'.format(doc.id, utils.clean_str(sent.text)))

def build_sst_matrices(lines):
    X, y = [], []
    for line in lines:
        words = line.split(' ')
        label = [0, 0]
        label[int(line[0])] = 1
        sent = clean_str(line[1:])
        X.append(sent)
        y.append(label)
    return (X, y)

def load_or_build_embedding(ds, vocab):
    # One-hot embedding
    # embd = eye(len(vocab))
    # return embd

    # Read Word Vectors
    # word_vector_file = 'data/glove.6B/glove.6B.300d.txt'
    # word_vector_file = 'data/corpus/' + dataset + '_word_vectors.txt'
    # _, embd, word_vector_map = loadWord2Vec(word_vector_file)
    # word_embeddings_dim = len(embd[0])
    try:
        word_vector_file = 'data/corpus/' + ds + '_word_vectors.txt'
        word_vec_vocab, embd, word_vec_id_map = loadWord2Vec(word_vector_file)
        word_embeddings_dim = len(embd[0])
        # word embedding matrix
        wm = np.matrix(embd)
        return word_vec_vocab, wm, word_vec_id_map
    except:
        print('Building embedding...')
        definitions = []
        for word in vocab:
            word = word.strip()
            synsets = wn.synsets(clean_str(word))
            word_defs = []
            for synset in synsets:
                syn_def = synset.definition()
                word_defs.append(syn_def)
            word_des = ' '.join(word_defs)
            if word_des == '':
                word_des = '<PAD>'
            definitions.append(word_des)
        tfidf_vec = TfidfVectorizer(max_features=1000)
        tfidf_matrix = tfidf_vec.fit_transform(definitions)
        tfidf_matrix_array = tfidf_matrix.toarray()
        word_vectors = []
        for i in range(len(vocab)):
            word = vocab[i]
            vector = tfidf_matrix_array[i]
            str_vector = []
            for j in range(len(vector)):
                str_vector.append(str(vector[j]))
            temp = ' '.join(str_vector)
            word_vector = word + ' ' + temp
            word_vectors.append(word_vector)
        string = '\n'.join(word_vectors)
        f = open('data/corpus/' + ds + '_word_vectors.txt', 'w')
        f.write(string)
        f.close()
        return load_or_build_embedding(ds, vocab)

def write_i_c_data(docs, context_fn, fdir, neg_prob=0.5):
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)
    for group, doc_list in group_docs.items():
        fout = open('{}/{}.tsv'.format(fdir, group), 'w')
        for doc in doc_list:
            visited_frames = {}
            for f in doc.frames:
                k = (f.i.text, f.c.text, f.ev.text)
                if k not in visited_frames:
                    context_text = utils.clean_str(context_fn(doc, f.i, f.ev))
                    fout.write('2\t{}\t{}\t{}\n'.format(doc.id, utils.clean_str(f.i.text), context_text))
                    fout.write('1\t{}\t{}\t{}\n'.format(doc.id, utils.clean_str(f.c.text), context_text))
                    visited_frames[k] = True
                    if random.random() <= neg_prob:
                        neg_i = get_neg_i(doc, f)
                        fout.write('0\t{}\t{}\t{}\n'.format(doc.id, utils.clean_str(neg_i.text), context_text))

def prod_show_by_id():
    # obtain the parameters
    ## parameters with default values
    para = def_para()
    if (para is None):
        return help_prod_show_by_id()
    ## user given parameters
    prod_id = request.args.get("prod_id")
    if (prod_id is None):
        return help_prod_show_by_id("No Product ID given!")
    prod_id = utils.clean_str(prod_id, strip=True)
    sql = sql_command.sql_product_by_id.format(prod_id)
    return gen_response(sql, para, helper_funct=help_prod_show_by_id)

def parserNcbiTxtFile_simple(path):
    logging.info("loadData: {}".format(path))
    if opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    documents = []
    id = title = abstractt = ""
    document = Document()
    for line in codecs.open(path, 'r', 'utf-8'):
        line = line.strip()
        if line != "":
            linesplits = line.split("|")
            if len(linesplits) == 3:
                if linesplits[1] == "t":
                    id = linesplits[0]
                    title = linesplits[2]
                if linesplits[1] == "a":
                    abstractt = linesplits[2]
            linesplitsEntity = line.split("\t")
            if len(linesplitsEntity) == 6:
                if linesplitsEntity[4] == 'Chemical':
                    continue
                meshId = linesplitsEntity[len(linesplitsEntity) - 1]
                index = meshId.find(":")
                if index != -1:
                    meshId = meshId[index + 1:]
                meshId = meshId.strip()
                entity = Entity()
                entitytext = clean_str(linesplitsEntity[3])
                if opt.use_word2digit:
                    entitytext = wordToDigit(entitytext)
                entity.setEntity(linesplitsEntity[0], int(linesplitsEntity[1]), int(linesplitsEntity[2]),
                                 entitytext, 'Disease', meshId.strip())
                document.entities.append(entity)
        else:
            if len(id) > 0 and len(title) > 0 and len(abstractt) > 0:
                document.initDocument(id, title, abstractt)
                # if id == '2234245':
                #     print(id)
                document_text = title + " " + abstractt
                sentences = get_sentences_and_tokens_from_nltk(document_text.lower(), nlp_tool, document.entities)
                document.sents = sentences
                document.initDocument(id, title, abstractt)
                documents.append(document)
            id = title = abstractt = ""
            document = Document()
    return documents

def create_mpqa():
    mpqa = read_file('raw_datasets/mpqa.all')
    # build matrices
    X, y = [], []
    for line in mpqa:
        words = line.split(' ')
        label = [0, 0]
        label[int(line[0])] = 1
        sent = clean_str(line[1:])
        X.append(sent)
        y.append(label)
    # build vocab
    mpqa_vocab = Vocabulary(X)
    print('vocab', len(mpqa_vocab.vocab))
    # encode sents
    max_len = compute_avg_len(X)
    for i in range(len(X)):
        X[i] = encode_sent(X[i].split(' '), mpqa_vocab.encoding, max_len)
    # build embeddings
    embeddings = []
    for name, (emb_vocab, emb_vectors) in embeddings_map.items():
        embedding, found = create_embeddings(mpqa_vocab, emb_vocab, emb_vectors, 300)
        embeddings.append(embedding)
        print('{} - {}'.format(name, found))
    w2v_embeddings, glove_embeddings, nb_embeddings = embeddings
    # shuffle
    X, y = np.array(X), np.array(y)
    indices = np.random.permutation(len(X))
    X, y = X[indices], y[indices]
    split_idx = int(len(X) * 0.9)
    X_train, X_valid = X[:split_idx], X[split_idx:]
    y_train, y_valid = y[:split_idx], y[split_idx:]
    print('train', X_train.shape, y_train.shape)
    print('valid', X_valid.shape, y_valid.shape)
    # save objects
    save_object('datasets/mpqa_train', (X_train, y_train))
    save_object('datasets/mpqa_valid', (X_valid, y_valid))
    save_object('datasets/mpqa_vocab', mpqa_vocab)
    save_object('datasets/mpqa_w2v_embs', w2v_embeddings)
    save_object('datasets/mpqa_glove_embs', glove_embeddings)
    save_object('datasets/mpqa_nb_embs', nb_embeddings)

def cate_purchased_by_cust():
    # obtain the parameters
    ## parameters with default values
    para = def_para()
    if (para is None):
        return help_cate_purchased_by_cust()
    ## user given parameters
    cust_id = request.args.get('cust_id')
    if (cust_id is None):
        return help_cate_purchased_by_cust(err="Please enter the cust_id!")
    cust_id = utils.clean_str(str(cust_id), strip=True)
    sql = sql_command.sql_purchased_by_cust.format(cust_id)
    return gen_response(sql, para, helper_funct=help_cate_purchased_by_cust)

def loadAbbreviations(abbrePath):
    abbreviations = list()
    lines = codecs.open(abbrePath, 'r', 'utf-8')
    for line in lines:
        line = line.strip().lower()
        if line == '':
            continue
        linesplits = line.split("\t")
        abbre = DiseaseAbbreviation()
        if len(linesplits) < 3:
            print(line)
        linesplits[1] = clean_str(linesplits[1])
        linesplits[2] = clean_str(linesplits[2])
        if opt.use_word2digit:
            linesplits[1] = wordToDigit(linesplits[1])
            linesplits[2] = wordToDigit(linesplits[2])
        abbre.initAbbre(linesplits[0].strip(), linesplits[1], linesplits[2])
        if abbre not in abbreviations:
            abbreviations.append(abbre)
    return abbreviations

def cls_tf_idf(model, label, split):
    fr = open("processed_data{}/{}/{}.{}.{}.unable.label".format(modified, task_name, task_name, split, label), 'r')
    lines = []
    for l in fr:
        lines.append(l)
    fr.close()
    fw = open("processed_data{}/{}/{}.{}.{}.data.label".format(modified, task_name, task_name, split, label), 'a')
    line_num = min(len(lines), max_line)
    for i in range(0, line_num, batch_size):
        batch_range = min(batch_size, line_num - i)
        batch_lines = lines[i:i + batch_range]
        batch_x = [clean_str(sent) for sent in batch_lines]
        pred, attn = model(batch_x)
        pred = np.argmax(pred.cpu().data.numpy(), axis=1)
        for line, x, pre, att in zip(batch_lines, batch_x, pred, attn):
            if len(x) > 0:
                att = att[:len(x)]
                if task_name == 'yelp':
                    avg = torch.mean(att)
                elif task_name == 'amazon':
                    avg = 0.4
                else:
                    avg = torch.mean(att) * 0.5 + 0.4 * 0.5
                mask = att.gt(avg)
                if sum(mask).item() == 0:
                    mask = torch.argmax(att).unsqueeze(0)
                else:
                    mask = torch.nonzero(mask.squeeze()).squeeze(1)
                idx = mask.cpu().numpy()
                idx = [int(ix) for ix in idx]
                contents = []
                for i in range(0, len(x)):
                    if i not in idx:
                        contents.append(x[i])
                wl = {
                    "content": ' '.join(contents),
                    "line": line.strip(),
                    "masks": list(idx),
                    "label": str(label)
                }
                # print(wl)
                wl_str = json.dumps(wl)
                fw.write(wl_str)
                fw.write("\n")
    fw.close()
    print("processed over!")

def count_links_from_same_domain(entry):
    # provides a count that is used as a delay for threads that will query the same domain
    from utils import clean_str
    kind = entry.get('kind')  # t1 for comments, t3 for posts
    data = entry.get('data')
    if kind == 't3':
        domain = clean_str(data, ['domain'])
        domains_d[domain] += 1
        # title = clean_str(data, ['title'])
        # log('{:<20.20}... {:>22.22}.. {0}'.format(title, domain, domains_d[domain]))
        return domain, domains_d[domain]  # returns how many times this domain appears in domains_d
    else:
        return '', 0

def reddit_post_worker(idx, entry, q_out):
    import datetime
    from utils import strip_emoji, pretty_datediff, clean_str
    from reddit import determine_if_video_media_from_reddit_json, ret_sub_icon
    show_listVideos_debug = True
    credate = ""
    is_a_video = False
    title_line2 = ""
    t_on = translation(30071)  # "on"
    t_pts = 'c'
    thumb_w = 0; thumb_h = 0
    try:
        kind = entry.get('kind')  # t1 for comments, t3 for posts
        data = entry.get('data')
        post_id = data.get('name')
        if data:
            if kind == 't3':
                title = clean_str(data, ['title'])
                description = clean_str(data, ['media', 'oembed', 'description'])
                post_selftext = clean_str(data, ['selftext'])
                description = post_selftext + '[CR]' + description if post_selftext else description
            else:
                title = clean_str(data, ['link_title'])
                description = clean_str(data, ['body'])
            title = strip_emoji(title)  # an emoji in the title was causing a KeyError u'\ud83c'
            commentsUrl = urlMain + clean_str(data, ['permalink'])
            try:
                aaa = data.get('created_utc')
                credate = datetime.datetime.utcfromtimestamp(aaa)
                now_utc = datetime.datetime.utcnow()
                pretty_date = pretty_datediff(now_utc, credate)
                credate = str(credate)
            except (AttributeError, TypeError, ValueError):
                credate = ""
            subreddit = clean_str(data, ['subreddit'])
            author = clean_str(data, ['author'])
            domain = clean_str(data, ['domain'])
            num_comments = data.get('num_comments', 0)
            d_url = clean_str(data, ['url'])
            link_url = clean_str(data, ['link_url'])
            media_oembed_url = clean_str(data, ['media', 'oembed', 'url'])
            media_url = next((item for item in [d_url, link_url, media_oembed_url] if item), '')
            thumb = clean_str(data, ['thumbnail'])
            # reddit uses non-url thumbnails like 'nsfw', 'default' or 'self' (alien holding camera with "?")
            if not thumb.startswith('http'):
                thumb = ""
            if thumb == "":
                thumb = clean_str(data, ['media', 'oembed', 'thumbnail_url']).replace('&amp;', '&')
            if thumb == "":  # use this subreddit's icon if thumb still empty
                try:
                    thumb = ret_sub_icon(subreddit)
                except:
                    pass
            try:
                preview = data.get('preview')['images'][0]['source']['url'].encode('utf-8').replace('&amp;', '&')
                try:
                    thumb_h = float(data.get('preview')['images'][0]['source']['height'])
                    thumb_w = float(data.get('preview')['images'][0]['source']['width'])
                except (AttributeError, TypeError, ValueError):
                    thumb_w = 0; thumb_h = 0
            except Exception as e:
                # a blank preview image will be replaced with poster_url from make_addon_url_from()
                # for domains that support it
                thumb_w = 0; thumb_h = 0; preview = ""
            is_a_video = determine_if_video_media_from_reddit_json(data)
            over_18 = data.get('over_18')
            title_line2 = "[I][COLOR dimgrey]%s %s [COLOR cadetblue]r/%s[/COLOR] (%d) %s[/COLOR][/I]" % (
                pretty_date, t_on, subreddit, num_comments, t_pts)
            if show_listVideos_debug:
                log("  POST%cTITLE%.2d=%s" % (("v" if is_a_video else " "), idx, title))
            tuple_for_addDirectoryItems = addLink(title=title,
                                                  title_line2=title_line2,
                                                  iconimage=thumb,
                                                  previewimage=preview,
                                                  preview_w=thumb_w,
                                                  preview_h=thumb_h,
                                                  domain=domain,
                                                  description=description,
                                                  credate=credate,
                                                  reddit_says_is_video=is_a_video,
                                                  commentsUrl=commentsUrl,
                                                  subreddit=subreddit,
                                                  media_url=media_url,
                                                  over_18=over_18,
                                                  posted_by=author,
                                                  num_comments=num_comments,
                                                  post_index=idx,
                                                  post_id=post_id)
            q_out.put([idx, tuple_for_addDirectoryItems])
    except Exception as e:
        log('    #reddit_post_worker EXCEPTION:' + repr(sys.exc_info()) + '--' + str(e))

def reddit_post_worker(idx, entry, q_out, delay=0):
    import datetime
    from utils import pretty_datediff, clean_str, get_int, format_description
    from reddit import determine_if_video_media_from_reddit_json
    from domains import sitesBase
    if delay > 0:
        xbmc.Monitor().waitForAbort(float(delay) / 1000)  # xbmc.sleep(delay)
    try:
        credate = ""
        is_a_video = False
        title_line2 = ""
        thumb_w = 0; thumb_h = 0
        t_on = translation(32071)  # "on"
        t_pts = u"\U00002709"      # envelope symbol (the comment-bubble symbol u"\U0001F4AC" doesn't render)
        t_up = u"\U000025B4"       # upvote symbol
        # on 3/21/2017 we're adding a new feature that lets users view their saved posts by entering
        # /user/username/saved as their subreddit. in addition to saved posts, users can also save
        # comments. we need to handle it by checking for "kind"
        kind = entry.get('kind')  # t1 for comments, t3 for posts
        data = entry.get('data')
        if data:
            if kind == 't3':
                title = clean_str(data, ['title'])
                description = clean_str(data, ['media', 'oembed', 'description'])
                post_selftext = clean_str(data, ['selftext'])
                description = post_selftext + '\n' + description if post_selftext else description
                domain = clean_str(data, ['domain'])
            else:
                title = clean_str(data, ['link_title'])
                description = clean_str(data, ['body'])
                domain = 'Comment post'
            description = format_description(description, hide_text_in_parens=False)
            first_link_in_description = None
            # title = strip_emoji(title)  # an emoji in the title was causing a KeyError u'\ud83c'
            title = format_description(title)
            is_a_video = determine_if_video_media_from_reddit_json(entry)
            log("  POS%s%cTITLE%.2d=%s d=%d" % (kind, ("v" if is_a_video else " "), idx, title, delay))
            post_id = entry['kind'] + '_' + data.get('id')  # same as entry['data']['name']
            commentsUrl = urlMain + clean_str(data, ['permalink'])
            try:
                aaa = data.get('created_utc')
                credate = datetime.datetime.utcfromtimestamp(aaa)
                now_utc = datetime.datetime.utcnow()
                pretty_date = pretty_datediff(now_utc, credate)
                credate = str(credate)
            except (AttributeError, TypeError, ValueError):
                credate = ""
            subreddit = clean_str(data, ['subreddit'])
            author = clean_str(data, ['author'])
            # post_excluded_from() is a misnomer: it just returns True if the subreddit is in the csv-list
            if (post_excluded_from(use_first_link_in_textpost_for_the_following_subreddits, subreddit)
                    or post_excluded_from(use_first_link_in_textpost_for_the_following_subreddits, 'all')
                    and domain.startswith('self.')):
                first_link_in_description = sitesBase.get_first_url_from(description)
                # override the domain so that the bottom right of the gui matches the link
                if first_link_in_description:
                    domain = '({uri.netloc})'.format(uri=urlparse.urlparse(first_link_in_description))
            ups = data.get('score', 0)  # downs not used anymore
            num_comments = data.get('num_comments', 0)
            d_url = clean_str(data, ['url'])
            link_url = clean_str(data, ['link_url'])
            media_oembed_url = clean_str(data, ['media', 'oembed', 'url'])
            media_url = next((item for item in [first_link_in_description, d_url, link_url, media_oembed_url] if item), '')
            thumb = clean_str(data, ['thumbnail'])
            # reddit uses non-url thumbnails like 'nsfw', 'default' or 'self' (alien holding camera with "?")
            if not thumb.startswith('http'):
                thumb = ""
            if thumb == "":
                thumb = clean_str(data, ['media', 'oembed', 'thumbnail_url']).replace('&amp;', '&')
            # a blank preview image will be replaced with poster_url from parse_reddit_link() for domains that support it
            preview = clean_str(data, ['preview', 'images', 0, 'source', 'url']).replace('&amp;', '&')
            thumb_h = get_int(data, ['preview', 'images', 0, 'source', 'height'])
            thumb_w = get_int(data, ['preview', 'images', 0, 'source', 'width'])
            # preview images are 'keep'-stretched to fit inside 1080x1080.
            # if the preview image is smaller than the box we have for the thumbnail,
            # use it as the thumbnail instead of a bigger stretched image
            if thumb_w > 0 and thumb_w < 280:
                thumb = preview
                thumb_w = 0; thumb_h = 0; preview = ""
            over_18 = data.get('over_18')
            title_line2 = "[I][COLOR dimgrey]%d%c %s %s [B][COLOR cadetblue]r/%s[/COLOR][/B] (%d) %s[/COLOR][/I]" % (
                ups, t_up, pretty_date, t_on, subreddit, num_comments, t_pts)
            liz = addLink(title=title,
                          title_line2=title_line2,
                          iconimage=thumb,
                          previewimage=preview,
                          preview_w=thumb_w,
                          preview_h=thumb_h,
                          domain=domain,
                          description=description,
                          credate=credate,
                          reddit_says_is_video=is_a_video,
                          commentsUrl=commentsUrl,
                          subreddit=subreddit,
                          link_url=media_url,
                          over_18=over_18,
                          posted_by=author,
                          num_comments=num_comments,
                          post_id=post_id)
            q_out.put([idx, liz])  # we put the idx back for easy sorting
    except Exception as e:
        log('    #reddit_post_worker EXCEPTION:' + repr(sys.exc_info()) + '--' + str(e))

def listLinksInComment(url, name, type_):
    from guis import progressBG
    from reddit import reddit_request
    from utils import clean_str, remove_duplicates, is_filtered
    from default import comments_link_filter

    log('listLinksInComment:%s:%s' % (type_, url))

    post_title = ''
    global harvest

    # convert the comments url to its .json form. urls that contain non-ascii characters
    # (e.g. ".../comments/4x8v1k/test_test_what_is_déjà_vu/") must be percent-encoded first;
    # safe=':/?&' keeps the url structure intact and only encodes the odd characters.
    url = urllib.quote_plus(url, safe=':/?&')
    if '?' in url:
        url = url.split('?', 1)[0] + '.json?' + url.split('?', 1)[1]
    else:
        url += '.json'

    xbmc_busy()
    loading_indicator = progressBG('Loading...')
    loading_indicator.update(0, 'Retrieving comments')
    content = reddit_request(url)
    loading_indicator.update(10, 'Parsing')
    if not content:
        loading_indicator.end()
        return
    try:
        xbmc_busy()
        content = json.loads(content)

        # harvest links in the post text (just 1)
        r_linkHunter(content[0]['data']['children'])

        submitter = clean_str(content, [0, 'data', 'children', 0, 'data', 'author'])
        # the post title is provided in the json; use that instead of messages from addLink()
        post_title = clean_str(content, [0, 'data', 'children', 0, 'data', 'title'])

        # harvest links in the comments
        r_linkHunter(content[1]['data']['children'])

        comments_count_orig = len(harvest)
        # remove duplicate links
        def k2(x): return (x[2], x[3])
        harvest = remove_duplicates(harvest, k2)
        comments_count_rd = len(harvest)
        loading_indicator.update(15, 'Removed %d duplicates' % (comments_count_orig - comments_count_rd))

        c_threads = []
        q_liz = Queue()
        comments_count = len(harvest)
        filtered_posts = 0
        for idx, h in enumerate(harvest):
            comment_score = h[0]
            link_url = h[2]
            if comment_score < int_CommentTreshold:
                log('    comment score %d < %d, skipped' % (comment_score, int_CommentTreshold))
                filtered_posts += 1
                continue
            if is_filtered(comments_link_filter, link_url):
                log('    [{0}] is hidden by comments_link_filter'.format(link_url))
                filtered_posts += 1
                continue
            # count how many times we're hitting the same domain and compute a delay to avoid hammering it
            domain, domain_count = count_links_from_same_domain_comments(link_url)
            delay = compute_anti_dos_delay(domain, domain_count)
            # have threads process each comment post
            t = threading.Thread(target=reddit_comment_worker, args=(idx, h, q_liz, submitter, delay), name='#t%.2d' % idx)
            c_threads.append(t)
            t.start()

        log(repr(domains_d))
        # check the queue to determine progress
        break_counter = 0  # to avoid infinite loop
        expected_listitems = (comments_count - filtered_posts)
        if expected_listitems > 0:
            loading_indicator.set_tick_total(expected_listitems)
            last_queue_size = 0
            while q_liz.qsize() < expected_listitems:
                if break_counter >= 100:
                    break
                # each change in the queue size gets a tick on our progress track
                if last_queue_size < q_liz.qsize():
                    items_added = q_liz.qsize() - last_queue_size
                    loading_indicator.tick(items_added, 'Parsing')
                else:
                    break_counter += 1
                last_queue_size = q_liz.qsize()
                xbmc.sleep(50)

        # wait for all threads to finish before collecting the list items
        for idx, t in enumerate(c_threads):
            t.join(timeout=20)
        xbmc_busy(False)

        # compare the number of entries to the returned results
        if q_liz.qsize() != expected_listitems:
            log('some threads did not return a listitem. total comments:%d expecting(%d) but only got(%d)' % (
                comments_count, expected_listitems, q_liz.qsize()))

        li = [liz for idx, liz in sorted(q_liz.queue)]
        with q_liz.mutex:
            q_liz.queue.clear()
    except Exception as e:
        log('  ' + str(e))

    loading_indicator.end()  # it is important to close xbmcgui.DialogProgressBG

    # this portion is abandoned for now. the initial plan was a textbox with auto-height in a
    # grouplist to mimic the comment tree, but it is not clear how links can be followed from there.
    from guis import comments_GUI2
    ui = comments_GUI2('view_464_comments_grouplist.xml', addon_path, defaultSkin='Default', defaultRes='1080i', listing=li, id=55)
    ui.title_bar_text = post_title
    ui.doModal()
    del ui
    return
