def get_context_data(self, **kwargs):
    context = super(IndexJsonView, self).get_context_data(**kwargs)
    stories = context['stories']
    context = self.clean_context(context)
    context['stories'] = []
    for story in stories:
        story_json = {
            'id': story.id,
            'title': story.title,
            'selfpost': story.selfpost,
            'poll': story.poll,
            'username': story.username,
            'score': story.score,
            'comments': story.comments,
            'story_type': story.story_type,
            'time': format(story.time, 'r'),
            'time_unix': format(story.time, 'U'),
            'cache': format(story.cache, 'r'),
            'cache_unix': format(story.cache, 'U'),
        }
        if not story.selfpost:
            story_json['url'] = story.url
            story_json['domain'] = domain(story.url)
        context['stories'].append(story_json)
    context['page'] = {
        'current': stories.number,
        'total': stories.paginator.num_pages,
    }
    return context
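# Hedged sketch of the context this view builds; the field values below are
# illustrative, not taken from real data:
#
# {
#     'stories': [
#         {'id': 1, 'title': 'Example story', 'selfpost': False, 'poll': False,
#          'username': 'alice', 'score': 42, 'comments': 7,
#          'story_type': 'story', 'time': '...', 'time_unix': '...',
#          'cache': '...', 'cache_unix': '...',
#          'url': 'https://example.com/post', 'domain': 'example.com'},
#     ],
#     'page': {'current': 1, 'total': 10},
# }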
def fix_url(cls, url):
    id_ = get_id(url)
    if re.find(r'^https?://', id_):
        return url
    if re.find(r'^https?://', url):
        domain = utils.domain(url)
    else:
        domain = 'www.nicovideo.jp'
    return 'https://{}/watch/{}'.format(domain, id_)
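# Hedged usage sketch for fix_url(); assumes get_id() extracts the video id
# (e.g. 'sm9') and that re.find is the project's regex helper returning the
# match or None. The URLs are illustrative:
#
# fix_url(cls, 'sm9')
#     -> 'https://www.nicovideo.jp/watch/sm9'    # bare id: default www host
# fix_url(cls, 'https://sp.nicovideo.jp/watch/sm9')
#     -> 'https://sp.nicovideo.jp/watch/sm9'     # full URL: host is preserved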
def read_domain_data(filepath, domain_col, data_cols_to_read, delimiter, skip_rows):
    domains = {}
    row_count = 0  # for debugging
    with open(filepath, 'r') as f:
        reader = csv.reader(f, delimiter=delimiter)
        # skip headers
        if skip_rows is not None:
            for _ in range(skip_rows):
                row_count += 1
                next(reader)
        # process the rows
        for row in reader:
            if domain_col >= len(row):
                raise ValueError('Invalid domain index: {}, {}'.format(
                    domain_col, ', '.join(row)))
            d = domain(row[domain_col])
            if d in domains:
                logging.info('Domain has already been processed: {}. '
                             'Skipping new values.'.format(d))
                logging.info('Existing data: {}'.format(', '.join(domains[d])))
                logging.info('New data: {}'.format(', '.join(row)))
                row_count += 1
                # keep the first occurrence, as the log message states
                continue
            new_row = []
            if data_cols_to_read is not None:
                for idx in data_cols_to_read:
                    if idx >= len(row):
                        raise ValueError('Invalid index: {}, {}'.format(
                            idx, ', '.join(row)))
                    elif idx == domain_col:
                        logging.info('Data column the same as the domain column. Skipping.')
                    else:
                        new_row.append(row[idx])
            row_count += 1
            domains[d] = new_row
    return domains
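# Hedged usage sketch for read_domain_data(); the file name and contents are
# illustrative. Given a tab-separated 'sites.tsv' with one header row:
#
#     site    rank    category
#     https://example.com/page    1    news
#
# reading domains from column 0 and keeping column 1 would yield
# {'example.com': ['1']}, assuming domain() normalizes a URL to its host:
#
# domains = read_domain_data('sites.tsv', domain_col=0, data_cols_to_read=[1],
#                            delimiter='\t', skip_rows=1)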
def get_context_data(self, **kwargs):
    context = super(CommentsJsonView, self).get_context_data(**kwargs)
    story = context.get('story', None)
    polls = context.get('polls', None)
    total_votes = context.get('total_votes', None)
    root_comments = cache_tree_children(context.get('nodes', None))
    context = self.clean_context(context)
    if story:
        context['story'] = {
            'id': story.id,
            'title': story.title,
            'selfpost': story.selfpost,
            'poll': story.poll,
            'score': story.score,
            'username': story.username,
            'time': format(story.time, 'r'),
            'time_unix': format(story.time, 'U'),
            'comments': story.comments,
            'cache': format(story.cache, 'r'),
            'cache_unix': format(story.cache, 'U'),
        }
        if story.selfpost:
            context['story']['selfpost_text'] = story.selfpost_text
        else:
            context['story']['url'] = story.url
            context['story']['domain'] = domain(story.url)
    if polls:
        context['polls'] = []
        for poll in polls:
            context['polls'].append({
                'name': poll.name,
                'votes': poll.score,
                'percentage': poll_percentage(poll.score, total_votes, 2),
            })
    context['comments'] = []
    for root_comment in root_comments:
        context['comments'].append(
            self.recursive_node_to_dict(root_comment, bool(story)))
    return context
def prepare_context(self):
    story = self.context.get('story')
    polls = self.context.get('polls')
    total_votes = self.context.get('total_votes')
    root_comments = self.list_to_nested(self.context.get('comments', []))
    self.clean_context()
    if story:
        self.context['story'] = {
            'id': story.id,
            'title': story.title,
            'selfpost': story.selfpost,
            'poll': story.poll,
            'score': story.score,
            'username': story.username,
            'time': format(story.time, 'r'),
            'time_unix': format(story.time, 'U'),
            'comments': story.comments,
            'cache': format(story.cache, 'r'),
            'cache_unix': format(story.cache, 'U'),
        }
        if story.selfpost:
            self.context['story']['selfpost_text'] = story.selfpost_text
        else:
            self.context['story']['url'] = story.url
            self.context['story']['domain'] = utils.domain(story.url)
        if story.dead:
            self.context['story']['dead'] = True
    if polls:
        self.context['polls'] = []
        for poll in polls:
            self.context['polls'].append({
                'name': poll.name,
                'votes': poll.score,
                'percentage': utils.poll_percentage(poll.score, total_votes, 2),
            })
    self.context['comments'] = []
    # recursive_comment_to_dict and list_to_nested could be combined
    # to reduce looping
    for root_comment in root_comments:
        self.context['comments'].append(
            self.recursive_comment_to_dict(root_comment, bool(story)))
    return self.context
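# Hedged sketch of the context shape the comments views above build; the
# values are illustrative:
#
# {
#     'story': {'id': 1, 'title': '...', 'selfpost': True,
#               'selfpost_text': '...', 'score': 42, ...},
#     'polls': [{'name': 'Option A', 'votes': 10, 'percentage': 62.5}],
#     'comments': [...],  # nested dicts from the recursive comment serializer
# }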
def prepare_context(self):
    stories = self.context['stories']
    self.clean_context()
    self.context['stories'] = []
    for story in stories:
        story_json = {
            'id': story.id,
            'title': story.title,
            'selfpost': story.selfpost,
            'poll': story.poll,
            'username': story.username,
            'score': story.score,
            'comments': story.comments,
            'story_type': story.story_type,
            'time': format(story.time, 'r'),
            'time_unix': format(story.time, 'U'),
            'cache': format(story.cache, 'r'),
            'cache_unix': format(story.cache, 'U'),
        }
        if not story.selfpost:
            story_json['url'] = story.url
            story_json['domain'] = utils.domain(story.url)
        self.context['stories'].append(story_json)
    self.context['page'] = {
        'current': stories.number,
        'total': stories.paginator.num_pages,
    }
def get_videos(url, cw=None):
    ''' get_videos '''
    print_ = get_print(cw)
    if '/users/' in url:
        mode = 'users'
        username = url.split('/users/')[1].split('/')[0]
    elif '/pornstar/' in url:
        mode = 'pornstar'
        username = url.split('/pornstar/')[1].split('/')[0]
    elif '/model/' in url:
        mode = 'model'
        username = url.split('/model/')[1].split('/')[0]
    elif '/channels/' in url:
        mode = 'channels'
        username = url.split('/channels/')[1].split('/')[0]
    elif '/playlist/' in url:
        mode = 'playlist'
        username = url.split('/playlist/')[1].split('/')[0]
    else:
        raise Exception('Not supported url')
    username = username.split('?')[0].split('#')[0]

    session = Session()
    domain = utils.domain(url)

    if mode in ['pornstar']:
        url_main = 'https://{}/{}/{}'.format(domain, mode, username)
        html = downloader.read_html(url_main, session=session)
        soup = Soup(html)
        soup = fix_soup(soup, url_main, session, cw)
        for a in soup.findAll('a'):
            if '/{}/{}/videos/upload'.format(mode, username) in a.attrs.get('href', ''):
                free = True
                break
        else:
            free = False
        print_('free: {}'.format(free))

    # Range
    max_pid = get_max_range(cw, 500)
    max_pid = min(max_pid, 2000)

    html = downloader.read_html(url, session=session)
    soup = fix_soup(Soup(html), url, session, cw)

    info = {}

    # get title
    h1 = soup.find('h1')
    if h1:
        header = 'Playlist'
        title = h1.find(id='watchPlaylist')
    else:
        title = None
    if not title:
        header = 'Channel'
        profile = soup.find('div', class_='profileUserName')
        wrapper = soup.find('div', class_='titleWrapper')
        bio = soup.find('div', class_='withBio')
        title = soup.find('h1', {'itemprop': 'name'})
        if not title and profile:
            title = profile.a
        if not title and wrapper:
            title = wrapper.h1
        if not title and bio:
            title = bio.h1
    if not title:
        raise Exception('No title')
    #print(title)
    info['title'] = '[{}] {}'.format(header, title.text.strip())

    token = re.find('''token *= *['"](.*?)['"]''', html)
    print_('token: {}'.format(token))

    # get links
    hrefs = []
    fail = 0
    for p in range(1, 1 + 100):
        try:
            if mode in ['users', 'model']:
                if mode == 'users':
                    url_api = 'https://{}/users/{}/videos/public/'\
                              'ajax?o=mr&page={}'.format(domain, username, p)
                elif mode == 'model':
                    url_api = 'https://{}/model/{}/videos/upload/'\
                              'ajax?o=mr&page={}'.format(domain, username, p)
                r = session.post(url_api)
                soup = Soup(r.text)
                if soup.find('h1'):
                    print('break: h1')
                    break
            elif mode in ['pornstar']:
                if free:
                    url_api = 'https://{}/{}/{}/videos/upload'\
                              '?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('div', class_='videoUList')
                else:
                    url_api = 'https://{}/{}/{}?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('ul', class_='pornstarsVideos')
            elif mode in ['channels']:
                url_api = 'https://{}/{}/{}/videos?page={}'.format(domain, mode, username, p)
                soup = downloader.read_soup(url_api, session=session)
                soup = fix_soup(soup, url_api, session, cw)
                try:
                    soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
                except:
                    break
            elif mode in ['playlist']:
                #url_api = 'https://{}/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(domain, username, len(hrefs))
                if token is None:
                    raise Exception('no token')
                url_api = 'https://{}/playlist/viewChunked?id={}&token={}&page={}'.format(domain, username, token, p)
                soup = downloader.read_soup(url_api, session=session)
            else:
                raise NotImplementedError(mode)
            fail = 0
        except Exception as e:
            print_(e)
            fail += 1
            if fail < 2:
                continue
            else:
                break
        finally:
            print_('{} ({})'.format(url_api, len(hrefs)))
        if cw and not cw.alive:
            return
        lis = soup.findAll('li', class_='videoblock')
        if not lis:
            print_('break: no lis')
            break
        if getattr(soup.find('title'), 'text', '').strip() == 'Page Not Found':
            print_('Page Not Found')
            break
        c = 0
        for li in lis:
            a = li.find('a')
            href = a.attrs['href']
            href = urljoin(url, href)
            if href in hrefs:
                continue
            c += 1
            if href.startswith('javascript:'):  # Remove Pornhub Premium
                print(href)
                continue
            hrefs.append(href)
        if c == 0:
            print('c==0')
            break
        print(c)  # 1320
        if len(hrefs) >= max_pid:
            break

    if cw:
        hrefs = filter_range(hrefs, cw.range)

    info['hrefs'] = hrefs
    return info
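# Hedged usage sketch for get_videos(); the URL is illustrative and the return
# shape follows the code above:
#
# info = get_videos('https://www.pornhub.com/users/someuser/videos')
# info['title']   # e.g. '[Channel] someuser'
# info['hrefs']   # absolute video page URLs, capped at max_pid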
def suitable(url):
    if domain(url.lower(), 2) not in ['weibo.com', 'weibo.cn']:
        return False
    if '/tv/' in url.lower():
        return False
    return True
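# Hedged usage sketch for suitable(); assumes domain(url, 2) returns the last
# two labels of the URL's host (the registrable domain). URLs are illustrative:
#
# suitable('https://weibo.com/u/1234567890')      -> True
# suitable('https://m.weibo.cn/status/123')       -> True   # domain is weibo.cn
# suitable('https://weibo.com/tv/show/1034:456')  -> False  # /tv/ is excluded
# suitable('https://example.com/page')            -> False  # wrong domain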
def run_uci_experiments(method, get_base_model, get_encoder, get_task,
                        C, C_w, lambda_,  # lambda_ restored per the docstring; used by the DANN branch
                        epochs, batch_size, n_models, n_jobs,
                        n_target_labeled, random_state, save, **kwargs):
    """
    Run experiments on the superconductivity dataset.

    Parameters
    ----------
    method: str
        name of the method used; should be one of the following:
        - NoReweight
        - TrAdaBoost
        - WANN
        - DANN
        - SrcOnly
        - TgtOnly

    get_base_model: callable
        constructor for the base learner; should take C, shape,
        activation and name as arguments

    get_encoder: callable
        constructor for the DANN encoder network

    get_task: callable
        constructor for the DANN task network

    C: float
        projecting constant for networks (arg of get_base_model)

    C_w: float
        projecting constant for the WANN weighting network

    lambda_: float
        DANN trade-off parameter

    epochs: int
        number of epochs

    batch_size: int
        size of the batches

    n_models: int
        number of bagged models

    n_jobs: int
        number of jobs to run in parallel; if n_jobs=None,
        no parallel computing is done

    n_target_labeled: int
        number of training target labeled data

    random_state: int
        seed number of the experiment

    save: boolean
        whether to save results in csv or not

    Returns
    -------
    df: DataFrame
        dataframe containing mse scores
    """
    print("Experiment for method: %s" % method)
    print("\n")
    folder = os.path.dirname(__file__)
    save_path = (folder + "/../dataset/results/" + "uci_" + method
                 + "_" + str(random_state))
    df = pd.DataFrame(columns=['state', 'method', 'source', 'target', 'score'])
    if save:
        try:
            df.to_csv(save_path + ".csv")
        except:
            try:
                os.mkdir(folder + "/../dataset/results")
            except:
                os.mkdir(folder + "/../dataset")
                os.mkdir(folder + "/../dataset/results")
            df.to_csv(save_path + ".csv")

    for source in [0, 1, 2, 3]:
        print("############# " + str(source) + " #############")
        target_list = [0, 1, 2, 3]
        target_list.remove(source)
        for target in target_list:
            print("--------- %s ----------" % str(target))
            data, X, y, cuts, split_col = superconduct()
            shape = X.shape[1]
            src_index = domain(data, cuts, split_col, source)
            tgt_index = domain(data, cuts, split_col, target)

            np.random.seed(0)
            tgt_train_index, tgt_test_index = train_test_split(
                tgt_index, train_size=n_target_labeled)
            train_index = np.concatenate((src_index, tgt_train_index))

            std_sc = StandardScaler()
            std_sc.fit(X[train_index])
            X = std_sc.transform(X)
            y = (y - y[train_index].mean()) / y[train_index].std()

            base_estimator = BaggingModels(func=get_base_model,
                                           n_models=n_models,
                                           n_jobs=n_jobs,
                                           shape=shape,
                                           C=C,
                                           random_state=random_state)
            fit_params = dict(epochs=epochs, batch_size=batch_size, verbose=0)

            if method == "SrcOnly":
                model = copy.deepcopy(base_estimator)
                model.fit(X[src_index], y[src_index], **fit_params)
            if method == "TgtOnly":
                model = copy.deepcopy(base_estimator)
                model.fit(X[tgt_train_index], y[tgt_train_index], **fit_params)
            if method == "NoReweight":
                model = copy.deepcopy(base_estimator)
                model.fit(X[train_index], y[train_index], **fit_params)
            if method == "TrAdaBoost":
                model = TwoStageTrAdaBoostR2(func=get_base_model,
                                             random_state=random_state,
                                             n_jobs=n_jobs,
                                             C=C,
                                             shape=X.shape[1])
                model.fit(X, y, [src_index, tgt_train_index], **fit_params)
            if method == "WANN":
                model = BaggingModels(WANN,
                                      get_base_model=get_base_model,
                                      C=C,
                                      C_w=C_w,
                                      n_models=n_models,
                                      n_jobs=n_jobs,
                                      random_state=random_state)
                model.fit(X, y, index=[src_index, tgt_train_index], **fit_params)
            if method == "DANN":
                if lambda_ is None:
                    try:
                        lambda_ = DICT_DANN[str(source) + "_" + str(target)]
                    except:
                        lambda_ = cross_val(
                            "DANN", X, y, src_index, None, tgt_train_index,
                            params=[0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1],
                            fit_params=fit_params,
                            cv=5,
                            get_encoder=get_encoder,
                            get_task=get_task)
                    try:
                        DICT_DANN[str(source) + "_" + str(target)] = lambda_
                    except:
                        pass
                print("lambda: %.3f" % lambda_)
                model = BaggingModels(DANN,
                                      get_encoder=get_encoder,
                                      get_task=get_task,
                                      C=C,
                                      lambda_=lambda_,
                                      n_models=n_models,
                                      n_jobs=n_jobs,
                                      random_state=random_state)
                resize_tgt_ind = np.array([
                    tgt_train_index[i % len(tgt_train_index)]
                    for i in range(len(src_index))
                ])
                model.fit(X, y,
                          index=[src_index, resize_tgt_ind, tgt_train_index],
                          **fit_params)

            y_pred = model.predict(X)
            score = mean_squared_error(y[tgt_test_index],
                                       y_pred[tgt_test_index])
            _line = pd.DataFrame(
                [[random_state, method, source, target, score]],
                columns=['state', 'method', 'source', 'target', 'score'])
            df = df.append(_line, ignore_index=True)
            if save:
                df.to_csv(save_path + ".csv")
            print('Target_score: %.3f' % score)
            K.clear_session()
    return df
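# Hedged call sketch for run_uci_experiments(); the constructors and parameter
# values below are placeholders, not the project's actual defaults:
#
# df = run_uci_experiments(method="WANN",
#                          get_base_model=get_base_model,
#                          get_encoder=get_encoder,
#                          get_task=get_task,
#                          C=1.0, C_w=1.0, lambda_=None,
#                          epochs=100, batch_size=64,
#                          n_models=10, n_jobs=None,
#                          n_target_labeled=30,
#                          random_state=0, save=False)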
def main():
    parser = argparse.ArgumentParser(
        description=('Create a list of domains with standardized URLs. '
                     'Do this either from a primary CSV or from a provided list.'),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'dest_file',
        type=str,
        help=('Destination file for the combined data. '
              'The normalized domain will always be in the first column, '
              'followed by the columns to keep from the primary file, '
              'followed by the columns to keep from the secondary file.'))
    parser.add_argument(
        '-p', '--primary_csv',
        type=str,
        help=('A CSV file containing domains. '
              'All domains from this file will be kept in the final output.'))
    parser.add_argument(
        '-s', '--secondary_csv',
        type=str,
        default=None,
        help=('A CSV containing domain data. '
              'Domains from this file will not be kept '
              'unless they appear in the primary file.'))
    parser.add_argument(
        '-domain1', '--primary_domain_col',
        type=int,
        default=0,
        help='The column with the domain in the primary source file.')
    parser.add_argument(
        '-data1', '--primary_data_cols',
        type=int,
        nargs='+',
        help='Columns with additional data from the primary source file to keep in the output.')
    parser.add_argument(
        '-delim1', '--primary_delim',
        type=str,
        default='\t',
        help='The delimiter in the primary source file.')
    parser.add_argument(
        '-skip1', '--primary_skip_rows',
        type=int,
        default=0,
        help='The number of header rows in the primary source file to skip.')
    parser.add_argument(
        '-domain2', '--secondary_domain_col',
        type=int,
        default=0,
        help='The column with the domain in the secondary source file.')
    parser.add_argument(
        '-data2', '--secondary_data_cols',
        type=int,
        nargs='+',
        help='Columns with additional data from the secondary source file to keep in the output.')
    parser.add_argument(
        '-delim2', '--secondary_delim',
        type=str,
        default='\t',
        help='The delimiter in the secondary source file.')
    parser.add_argument(
        '-skip2', '--secondary_skip_rows',
        type=int,
        default=0,
        help='The number of header rows in the secondary source file to skip.')
    parser.add_argument(
        '-ddelim', '--dest_delim',
        type=str,
        default='\t',
        help='The delimiter in the destination file.')
    parser.add_argument(
        '-dhead', '--dest_col_headers',
        type=str,
        nargs='+',
        help=('The column headers in the destination file. '
              'Must match the number of columns being kept from both source files, '
              'plus the first column for the domain.'))
    parser.add_argument(
        '-exclude', '--exclude_domains',
        type=str,
        nargs='+',
        help='A list of domains to exclude.')
    parser.add_argument(
        '-include', '--include_domains',
        type=str,
        nargs='+',
        help='A list of additional domains to include in the final list.')
    args = parser.parse_args()

    if os.path.dirname(args.dest_file) != '' and not os.path.exists(
            os.path.dirname(args.dest_file)):
        os.makedirs(os.path.dirname(args.dest_file))
    if (args.primary_csv is None or not os.path.exists(args.primary_csv)) \
            and args.include_domains is None:
        raise ValueError('No input provided.')

    # read the CSVs
    logging.debug('Reading primary file.')
    if args.primary_csv is not None:
        primary_data = read_domain_data(args.primary_csv,
                                        args.primary_domain_col,
                                        args.primary_data_cols,
                                        args.primary_delim,
                                        args.primary_skip_rows)
    else:
        primary_data = {}
    if args.include_domains is not None:
        for raw_d in args.include_domains:
            d = domain(raw_d)
            if d not in primary_data:
                primary_data[d] = []

    logging.debug('Reading secondary file.')
    if args.secondary_csv is not None:
        secondary_data = read_domain_data(args.secondary_csv,
                                          args.secondary_domain_col,
                                          args.secondary_data_cols,
                                          args.secondary_delim,
                                          args.secondary_skip_rows)
    else:
        secondary_data = {}

    # combine the data from both files into rows
    excluded_domains = (frozenset(args.exclude_domains)
                        if args.exclude_domains is not None else frozenset())
    combined_rows = []
    for d in primary_data.keys():
        if d in excluded_domains:
            logging.info('Skipping {}'.format(d))
            continue
        new_row = [d]
        new_row.extend(primary_data[d])
        if d in secondary_data:
            new_row.extend(secondary_data[d])
        combined_rows.append(new_row)
    sorted_data = sorted(combined_rows)

    # write the data to the dest file
    logging.debug('Writing combined file.')
    with open(args.dest_file, 'w') as f:
        writer = csv.writer(f, delimiter=args.dest_delim)
        if args.dest_col_headers is not None:
            sorted_data.insert(0, args.dest_col_headers)
        else:
            sorted_data.insert(0, ['domain'])
        writer.writerows(sorted_data)
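# Hedged example invocation; the script and file names are illustrative:
#
#   python combine_domain_lists.py out/domains.tsv \
#       -p primary.csv -domain1 0 -data1 1 2 -delim1 ',' -skip1 1 \
#       -s secondary.tsv -domain2 0 -data2 3 \
#       -dhead domain rank category traffic \
#       -exclude spam.example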
try:
    for match in call[crawler].crawl_full():
        with matchfile(match) as fw:
            fw.write(str(match) + '\n')
        if 0 < match.tostart < 3600 and tbd not in match.teams:
            pool[match.webpage] = match
except KeyboardInterrupt:
    sys.exit(0)
except Exception as e:
    cooldown[crawler] = time.time() + cd
    print('{0} cooldowned for {1}s for exception'.format(crawler, cd))
    print(e)
    cd += 0  # cooldown duration is left unchanged

with open('httpalias', 'a') as fw:
    G = networkx.Graph()
    # count the pooled matches; len(G) here would always be 0,
    # since the graph was just created
    print('{0} matches start <1h'.format(len(pool)))
    for s1, s2 in itertools.combinations(pool.values(), 2):
        # connect only identical matches listed on different sites
        if domain(s1) == domain(s2) or s1 != s2:
            continue
        G.add_edge(s1.webpage, s2.webpage)
    for idx, c in enumerate(networkx.connected_components(G)):
        fw.write(' '.join(c) + '\n')
        print('###MATCHED### {0}: '.format(idx) + ' '.join(c))
        profit = ((max([pool[w].returns[0] for w in c]) - 1)
                  * (max([pool[w].returns[1] for w in c]) - 1) - 1)
        if profit < profit_trs:
            continue
        print(red('  PROFIT {0:.3}'.format(profit)))
        for w in c:
            s = pool[w]
            print(red('    {0} {1} ({2}): {3}/{4} {5} {6}'.format(
                s.teams[0], s.teams[1], s.series, s.returns,
                s.poolsize, domain(s), s.tostart)))
time.sleep(5)
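# Hedged note on the profit test above: (r1-1)*(r2-1) - 1 > 0 rearranges to
# 1/r1 + 1/r2 < 1, the classic two-outcome arbitrage condition. For example,
# with best returns r1=2.2 and r2=2.1 on opposite outcomes at two sites:
#     (2.2-1)*(2.1-1) - 1 = 1.2*1.1 - 1 = 0.32 > 0
#     1/2.2 + 1/2.1 ≈ 0.93 < 1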