def get_context_data(self, **kwargs):
    context = super(IndexJsonView, self).get_context_data(**kwargs)
    stories = context['stories']
    context = self.clean_context(context)
    context['stories'] = []
    for story in stories:
        story_json = {
            'id': story.id,
            'title': story.title,
            'selfpost': story.selfpost,
            'poll': story.poll,
            'username': story.username,
            'score': story.score,
            'comments': story.comments,
            'story_type': story.story_type,
            'time': format(story.time, 'r'),
            'time_unix': format(story.time, 'U'),
            'cache': format(story.cache, 'r'),
            'cache_unix': format(story.cache, 'U'),
        }
        if not story.selfpost:
            story_json['url'] = story.url
            story_json['domain'] = domain(story.url)
        context['stories'].append(story_json)
    context['page'] = {
        'current': stories.number,
        'total': stories.paginator.num_pages,
    }
    return context
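# Hedged sketch of the context this view builds; the field values below are
# illustrative, not taken from real data:
#
# {
#     'stories': [
#         {'id': 1, 'title': 'Example story', 'selfpost': False, 'poll': False,
#          'username': 'alice', 'score': 42, 'comments': 7,
#          'story_type': 'story', 'time': '...', 'time_unix': '...',
#          'cache': '...', 'cache_unix': '...',
#          'url': 'https://example.com/post', 'domain': 'example.com'},
#     ],
#     'page': {'current': 1, 'total': 10},
# }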
def fix_url(cls, url):
    id_ = get_id(url)
    if re.find(r'^https?://', id_):
        return url
    if re.find(r'^https?://', url):
        domain = utils.domain(url)
    else:
        domain = 'www.nicovideo.jp'
    return 'https://{}/watch/{}'.format(domain, id_)
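# Hedged usage sketch for fix_url(); assumes get_id() extracts the video id
# (e.g. 'sm9') and that re.find is the project's regex helper returning the
# match or None. The URLs are illustrative:
#
# fix_url(cls, 'sm9')
#     -> 'https://www.nicovideo.jp/watch/sm9'    # bare id: default www host
# fix_url(cls, 'https://sp.nicovideo.jp/watch/sm9')
#     -> 'https://sp.nicovideo.jp/watch/sm9'     # full URL: host is preserved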
def read_domain_data(filepath, domain_col, data_cols_to_read, delimiter, skip_rows):
    domains = {}
    row_count = 0  # for debugging
    with open(filepath, 'r') as f:
        reader = csv.reader(f, delimiter=delimiter)
        # skip headers
        if skip_rows is not None:
            for _ in range(skip_rows):
                row_count += 1
                next(reader)
        # process the rows
        for row in reader:
            if domain_col >= len(row):
                raise ValueError('Invalid domain index: {}, {}'.format(
                    domain_col, ', '.join(row)))
            d = domain(row[domain_col])
            if d in domains:
                logging.info('Domain has already been processed: {}. '
                             'Skipping new values.'.format(d))
                logging.info('Existing data: {}'.format(', '.join(domains[d])))
                logging.info('New data: {}'.format(', '.join(row)))
                row_count += 1
                # keep the first occurrence, as the log message states
                continue
            new_row = []
            if data_cols_to_read is not None:
                for idx in data_cols_to_read:
                    if idx >= len(row):
                        raise ValueError('Invalid index: {}, {}'.format(
                            idx, ', '.join(row)))
                    elif idx == domain_col:
                        logging.info('Data column the same as the domain column. Skipping.')
                    else:
                        new_row.append(row[idx])
            row_count += 1
            domains[d] = new_row
    return domains
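# Hedged usage sketch for read_domain_data(); the file name and contents are
# illustrative. Given a tab-separated 'sites.tsv' with one header row:
#
#     site    rank    category
#     https://example.com/page    1    news
#
# reading domains from column 0 and keeping column 1 would yield
# {'example.com': ['1']}, assuming domain() normalizes a URL to its host:
#
# domains = read_domain_data('sites.tsv', domain_col=0, data_cols_to_read=[1],
#                            delimiter='\t', skip_rows=1)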
def get_context_data(self, **kwargs):
    context = super(CommentsJsonView, self).get_context_data(**kwargs)
    story = context.get('story', None)
    polls = context.get('polls', None)
    total_votes = context.get('total_votes', None)
    root_comments = cache_tree_children(context.get('nodes', None))
    context = self.clean_context(context)
    if story:
        context['story'] = {
            'id': story.id,
            'title': story.title,
            'selfpost': story.selfpost,
            'poll': story.poll,
            'score': story.score,
            'username': story.username,
            'time': format(story.time, 'r'),
            'time_unix': format(story.time, 'U'),
            'comments': story.comments,
            'cache': format(story.cache, 'r'),
            'cache_unix': format(story.cache, 'U'),
        }
        if story.selfpost:
            context['story']['selfpost_text'] = story.selfpost_text
        else:
            context['story']['url'] = story.url
            context['story']['domain'] = domain(story.url)
    if polls:
        context['polls'] = []
        for poll in polls:
            context['polls'].append({
                'name': poll.name,
                'votes': poll.score,
                'percentage': poll_percentage(poll.score, total_votes, 2),
            })
    context['comments'] = []
    for root_comment in root_comments:
        context['comments'].append(
            self.recursive_node_to_dict(root_comment, bool(story)))
    return context
def prepare_context(self):
    story = self.context.get('story')
    polls = self.context.get('polls')
    total_votes = self.context.get('total_votes')
    root_comments = self.list_to_nested(self.context.get('comments', []))
    self.clean_context()
    if story:
        self.context['story'] = {
            'id': story.id,
            'title': story.title,
            'selfpost': story.selfpost,
            'poll': story.poll,
            'score': story.score,
            'username': story.username,
            'time': format(story.time, 'r'),
            'time_unix': format(story.time, 'U'),
            'comments': story.comments,
            'cache': format(story.cache, 'r'),
            'cache_unix': format(story.cache, 'U'),
        }
        if story.selfpost:
            self.context['story']['selfpost_text'] = story.selfpost_text
        else:
            self.context['story']['url'] = story.url
            self.context['story']['domain'] = utils.domain(story.url)
        if story.dead:
            self.context['story']['dead'] = True
    if polls:
        self.context['polls'] = []
        for poll in polls:
            self.context['polls'].append({
                'name': poll.name,
                'votes': poll.score,
                'percentage': utils.poll_percentage(poll.score, total_votes, 2),
            })
    self.context['comments'] = []
    # recursive_comment_to_dict and list_to_nested could be combined
    # to reduce looping
    for root_comment in root_comments:
        self.context['comments'].append(
            self.recursive_comment_to_dict(root_comment, bool(story)))
    return self.context
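# Hedged sketch of the context shape the comments views above build; the
# values are illustrative:
#
# {
#     'story': {'id': 1, 'title': '...', 'selfpost': True,
#               'selfpost_text': '...', 'score': 42, ...},
#     'polls': [{'name': 'Option A', 'votes': 10, 'percentage': 62.5}],
#     'comments': [...],  # nested dicts from the recursive comment serializer
# }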
def prepare_context(self):
    stories = self.context['stories']
    self.clean_context()
    self.context['stories'] = []
    for story in stories:
        story_json = {
            'id': story.id,
            'title': story.title,
            'selfpost': story.selfpost,
            'poll': story.poll,
            'username': story.username,
            'score': story.score,
            'comments': story.comments,
            'story_type': story.story_type,
            'time': format(story.time, 'r'),
            'time_unix': format(story.time, 'U'),
            'cache': format(story.cache, 'r'),
            'cache_unix': format(story.cache, 'U'),
        }
        if not story.selfpost:
            story_json['url'] = story.url
            story_json['domain'] = utils.domain(story.url)
        self.context['stories'].append(story_json)
    self.context['page'] = {
        'current': stories.number,
        'total': stories.paginator.num_pages,
    }
def get_videos(url, cw=None):
    ''' get_videos '''
    print_ = get_print(cw)
    if '/users/' in url:
        mode = 'users'
        username = url.split('/users/')[1].split('/')[0]
    elif '/pornstar/' in url:
        mode = 'pornstar'
        username = url.split('/pornstar/')[1].split('/')[0]
    elif '/model/' in url:
        mode = 'model'
        username = url.split('/model/')[1].split('/')[0]
    elif '/channels/' in url:
        mode = 'channels'
        username = url.split('/channels/')[1].split('/')[0]
    elif '/playlist/' in url:
        mode = 'playlist'
        username = url.split('/playlist/')[1].split('/')[0]
    else:
        raise Exception('Not supported url')
    username = username.split('?')[0].split('#')[0]

    session = Session()
    domain = utils.domain(url)

    if mode in ['pornstar']:
        url_main = 'https://{}/{}/{}'.format(domain, mode, username)
        html = downloader.read_html(url_main, session=session)
        soup = Soup(html)
        soup = fix_soup(soup, url_main, session, cw)
        for a in soup.findAll('a'):
            if '/{}/{}/videos/upload'.format(mode, username) in a.attrs.get('href', ''):
                free = True
                break
        else:
            free = False
        print_('free: {}'.format(free))

    # Range
    max_pid = get_max_range(cw, 500)
    max_pid = min(max_pid, 2000)

    html = downloader.read_html(url, session=session)
    soup = fix_soup(Soup(html), url, session, cw)

    info = {}

    # get title
    h1 = soup.find('h1')
    if h1:
        header = 'Playlist'
        title = h1.find(id='watchPlaylist')
    else:
        title = None
    if not title:
        header = 'Channel'
        profile = soup.find('div', class_='profileUserName')
        wrapper = soup.find('div', class_='titleWrapper')
        bio = soup.find('div', class_='withBio')
        title = soup.find('h1', {'itemprop': 'name'})
        if not title and profile:
            title = profile.a
        if not title and wrapper:
            title = wrapper.h1
        if not title and bio:
            title = bio.h1
    if not title:
        raise Exception('No title')
    #print(title)
    info['title'] = '[{}] {}'.format(header, title.text.strip())

    token = re.find('''token *= *['"](.*?)['"]''', html)
    print_('token: {}'.format(token))

    # get links
    hrefs = []
    fail = 0
    for p in range(1, 1 + 100):
        try:
            if mode in ['users', 'model']:
                if mode == 'users':
                    url_api = 'https://{}/users/{}/videos/public/'\
                              'ajax?o=mr&page={}'.format(domain, username, p)
                elif mode == 'model':
                    url_api = 'https://{}/model/{}/videos/upload/'\
                              'ajax?o=mr&page={}'.format(domain, username, p)
                r = session.post(url_api)
                soup = Soup(r.text)
                if soup.find('h1'):
                    print('break: h1')
                    break
            elif mode in ['pornstar']:
                if free:
                    url_api = 'https://{}/{}/{}/videos/upload'\
                              '?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('div', class_='videoUList')
                else:
                    url_api = 'https://{}/{}/{}?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('ul', class_='pornstarsVideos')
            elif mode in ['channels']:
                url_api = 'https://{}/{}/{}/videos?page={}'.format(domain, mode, username, p)
                soup = downloader.read_soup(url_api, session=session)
                soup = fix_soup(soup, url_api, session, cw)
                try:
                    soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
                except:
                    break
            elif mode in ['playlist']:
                #url_api = 'https://{}/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(domain, username, len(hrefs))
                if token is None:
                    raise Exception('no token')
                url_api = 'https://{}/playlist/viewChunked?id={}&token={}&page={}'.format(domain, username, token, p)
                soup = downloader.read_soup(url_api, session=session)
            else:
                raise NotImplementedError(mode)
            fail = 0
        except Exception as e:
            print_(e)
            fail += 1
            if fail < 2:
                continue
            else:
                break
        finally:
            print_('{} ({})'.format(url_api, len(hrefs)))
        if cw and not cw.alive:
            return
        lis = soup.findAll('li', class_='videoblock')
        if not lis:
            print_('break: no lis')
            break
        if getattr(soup.find('title'), 'text', '').strip() == 'Page Not Found':
            print_('Page Not Found')
            break
        c = 0
        for li in lis:
            a = li.find('a')
            href = a.attrs['href']
            href = urljoin(url, href)
            if href in hrefs:
                continue
            c += 1
            if href.startswith('javascript:'):  # Remove Pornhub Premium
                print(href)
                continue
            hrefs.append(href)
        if c == 0:
            print('c==0')
            break
        print(c)  # 1320
        if len(hrefs) >= max_pid:
            break

    if cw:
        hrefs = filter_range(hrefs, cw.range)

    info['hrefs'] = hrefs
    return info
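# Hedged usage sketch for get_videos(); the URL is illustrative and the return
# shape follows the code above:
#
# info = get_videos('https://www.pornhub.com/users/someuser/videos')
# info['title']   # e.g. '[Channel] someuser'
# info['hrefs']   # absolute video page URLs, capped at max_pid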
def suitable(url):
    if domain(url.lower(), 2) not in ['weibo.com', 'weibo.cn']:
        return False
    if '/tv/' in url.lower():
        return False
    return True
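# Hedged usage sketch for suitable(); assumes domain(url, 2) returns the last
# two labels of the URL's host (the registrable domain). URLs are illustrative:
#
# suitable('https://weibo.com/u/1234567890')      -> True
# suitable('https://m.weibo.cn/status/123')       -> True   # domain is weibo.cn
# suitable('https://weibo.com/tv/show/1034:456')  -> False  # /tv/ is excluded
# suitable('https://example.com/page')            -> False  # wrong domain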
def run_uci_experiments(method, get_base_model, get_encoder, get_task,
                        C, C_w, lambda_,  # lambda_ restored per the docstring; used by the DANN branch
                        epochs, batch_size, n_models, n_jobs,
                        n_target_labeled, random_state, save, **kwargs):
    """
    Run experiments on the superconductivity dataset.

    Parameters
    ----------
    method: str
        name of the method used; should be one of the following:
        - NoReweight
        - TrAdaBoost
        - WANN
        - DANN
        - SrcOnly
        - TgtOnly

    get_base_model: callable
        constructor for the base learner; should take C, shape,
        activation and name as arguments

    get_encoder: callable
        constructor for the DANN encoder network

    get_task: callable
        constructor for the DANN task network

    C: float
        projecting constant for networks (arg of get_base_model)

    C_w: float
        projecting constant for the WANN weighting network

    lambda_: float
        DANN trade-off parameter

    epochs: int
        number of epochs

    batch_size: int
        size of the batches

    n_models: int
        number of bagged models

    n_jobs: int
        number of jobs to run in parallel; if n_jobs=None,
        no parallel computing is done

    n_target_labeled: int
        number of training target labeled data

    random_state: int
        seed number of the experiment

    save: boolean
        whether to save results in csv or not

    Returns
    -------
    df: DataFrame
        dataframe containing mse scores
    """
    print("Experiment for method: %s" % method)
    print("\n")
    folder = os.path.dirname(__file__)
    save_path = (folder + "/../dataset/results/" + "uci_" + method
                 + "_" + str(random_state))
    df = pd.DataFrame(columns=['state', 'method', 'source', 'target', 'score'])
    if save:
        try:
            df.to_csv(save_path + ".csv")
        except:
            try:
                os.mkdir(folder + "/../dataset/results")
            except:
                os.mkdir(folder + "/../dataset")
                os.mkdir(folder + "/../dataset/results")
            df.to_csv(save_path + ".csv")

    for source in [0, 1, 2, 3]:
        print("############# " + str(source) + " #############")
        target_list = [0, 1, 2, 3]
        target_list.remove(source)
        for target in target_list:
            print("--------- %s ----------" % str(target))
            data, X, y, cuts, split_col = superconduct()
            shape = X.shape[1]
            src_index = domain(data, cuts, split_col, source)
            tgt_index = domain(data, cuts, split_col, target)

            np.random.seed(0)
            tgt_train_index, tgt_test_index = train_test_split(
                tgt_index, train_size=n_target_labeled)
            train_index = np.concatenate((src_index, tgt_train_index))

            std_sc = StandardScaler()
            std_sc.fit(X[train_index])
            X = std_sc.transform(X)
            y = (y - y[train_index].mean()) / y[train_index].std()

            base_estimator = BaggingModels(func=get_base_model,
                                           n_models=n_models,
                                           n_jobs=n_jobs,
                                           shape=shape,
                                           C=C,
                                           random_state=random_state)
            fit_params = dict(epochs=epochs, batch_size=batch_size, verbose=0)

            if method == "SrcOnly":
                model = copy.deepcopy(base_estimator)
                model.fit(X[src_index], y[src_index], **fit_params)
            if method == "TgtOnly":
                model = copy.deepcopy(base_estimator)
                model.fit(X[tgt_train_index], y[tgt_train_index], **fit_params)
            if method == "NoReweight":
                model = copy.deepcopy(base_estimator)
                model.fit(X[train_index], y[train_index], **fit_params)
            if method == "TrAdaBoost":
                model = TwoStageTrAdaBoostR2(func=get_base_model,
                                             random_state=random_state,
                                             n_jobs=n_jobs,
                                             C=C,
                                             shape=X.shape[1])
                model.fit(X, y, [src_index, tgt_train_index], **fit_params)
            if method == "WANN":
                model = BaggingModels(WANN,
                                      get_base_model=get_base_model,
                                      C=C,
                                      C_w=C_w,
                                      n_models=n_models,
                                      n_jobs=n_jobs,
                                      random_state=random_state)
                model.fit(X, y, index=[src_index, tgt_train_index], **fit_params)
            if method == "DANN":
                if lambda_ is None:
                    try:
                        lambda_ = DICT_DANN[str(source) + "_" + str(target)]
                    except:
                        lambda_ = cross_val(
                            "DANN", X, y, src_index, None, tgt_train_index,
                            params=[0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1],
                            fit_params=fit_params,
                            cv=5,
                            get_encoder=get_encoder,
                            get_task=get_task)
                    try:
                        DICT_DANN[str(source) + "_" + str(target)] = lambda_
                    except:
                        pass
                print("lambda: %.3f" % lambda_)
                model = BaggingModels(DANN,
                                      get_encoder=get_encoder,
                                      get_task=get_task,
                                      C=C,
                                      lambda_=lambda_,
                                      n_models=n_models,
                                      n_jobs=n_jobs,
                                      random_state=random_state)
                resize_tgt_ind = np.array([
                    tgt_train_index[i % len(tgt_train_index)]
                    for i in range(len(src_index))
                ])
                model.fit(X, y,
                          index=[src_index, resize_tgt_ind, tgt_train_index],
                          **fit_params)

            y_pred = model.predict(X)
            score = mean_squared_error(y[tgt_test_index],
                                       y_pred[tgt_test_index])
            _line = pd.DataFrame(
                [[random_state, method, source, target, score]],
                columns=['state', 'method', 'source', 'target', 'score'])
            df = df.append(_line, ignore_index=True)
            if save:
                df.to_csv(save_path + ".csv")
            print('Target_score: %.3f' % score)
            K.clear_session()
    return df
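# Hedged call sketch for run_uci_experiments(); the constructors and parameter
# values below are placeholders, not the project's actual defaults:
#
# df = run_uci_experiments(method="WANN",
#                          get_base_model=get_base_model,
#                          get_encoder=get_encoder,
#                          get_task=get_task,
#                          C=1.0, C_w=1.0, lambda_=None,
#                          epochs=100, batch_size=64,
#                          n_models=10, n_jobs=None,
#                          n_target_labeled=30,
#                          random_state=0, save=False)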
def main():
    parser = argparse.ArgumentParser(
        description=('Create a list of domains with standardized URLs. '
                     'Do this either from a primary CSV or from a provided list.'),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'dest_file',
        type=str,
        help=('Destination file for the combined data. '
              'The normalized domain will always be in the first column, '
              'followed by the columns to keep from the primary file, '
              'followed by the columns to keep from the secondary file.'))
    parser.add_argument(
        '-p', '--primary_csv',
        type=str,
        help=('A CSV file containing domains. '
              'All domains from this file will be kept in the final output.'))
    parser.add_argument(
        '-s', '--secondary_csv',
        type=str,
        default=None,
        help=('A CSV containing domain data. '
              'Domains from this file will not be kept '
              'unless they appear in the primary file.'))
    parser.add_argument(
        '-domain1', '--primary_domain_col',
        type=int,
        default=0,
        help='The column with the domain in the primary source file.')
    parser.add_argument(
        '-data1', '--primary_data_cols',
        type=int,
        nargs='+',
        help='Columns with additional data from the primary source file to keep in the output.')
    parser.add_argument(
        '-delim1', '--primary_delim',
        type=str,
        default='\t',
        help='The delimiter in the primary source file.')
    parser.add_argument(
        '-skip1', '--primary_skip_rows',
        type=int,
        default=0,
        help='The number of header rows in the primary source file to skip.')
    parser.add_argument(
        '-domain2', '--secondary_domain_col',
        type=int,
        default=0,
        help='The column with the domain in the secondary source file.')
    parser.add_argument(
        '-data2', '--secondary_data_cols',
        type=int,
        nargs='+',
        help='Columns with additional data from the secondary source file to keep in the output.')
    parser.add_argument(
        '-delim2', '--secondary_delim',
        type=str,
        default='\t',
        help='The delimiter in the secondary source file.')
    parser.add_argument(
        '-skip2', '--secondary_skip_rows',
        type=int,
        default=0,
        help='The number of header rows in the secondary source file to skip.')
    parser.add_argument(
        '-ddelim', '--dest_delim',
        type=str,
        default='\t',
        help='The delimiter in the destination file.')
    parser.add_argument(
        '-dhead', '--dest_col_headers',
        type=str,
        nargs='+',
        help=('The column headers in the destination file. '
              'Must match the number of columns being kept from both source files, '
              'plus the first column for the domain.'))
    parser.add_argument(
        '-exclude', '--exclude_domains',
        type=str,
        nargs='+',
        help='A list of domains to exclude.')
    parser.add_argument(
        '-include', '--include_domains',
        type=str,
        nargs='+',
        help='A list of additional domains to include in the final list.')
    args = parser.parse_args()

    if os.path.dirname(args.dest_file) != '' and not os.path.exists(
            os.path.dirname(args.dest_file)):
        os.makedirs(os.path.dirname(args.dest_file))
    if (args.primary_csv is None or not os.path.exists(args.primary_csv)) \
            and args.include_domains is None:
        raise ValueError('No input provided.')

    # read the CSVs
    logging.debug('Reading primary file.')
    if args.primary_csv is not None:
        primary_data = read_domain_data(args.primary_csv,
                                        args.primary_domain_col,
                                        args.primary_data_cols,
                                        args.primary_delim,
                                        args.primary_skip_rows)
    else:
        primary_data = {}
    if args.include_domains is not None:
        for raw_d in args.include_domains:
            d = domain(raw_d)
            if d not in primary_data:
                primary_data[d] = []

    logging.debug('Reading secondary file.')
    if args.secondary_csv is not None:
        secondary_data = read_domain_data(args.secondary_csv,
                                          args.secondary_domain_col,
                                          args.secondary_data_cols,
                                          args.secondary_delim,
                                          args.secondary_skip_rows)
    else:
        secondary_data = {}

    # combine the data from both files into rows
    excluded_domains = (frozenset(args.exclude_domains)
                        if args.exclude_domains is not None else frozenset())
    combined_rows = []
    for d in primary_data.keys():
        if d in excluded_domains:
            logging.info('Skipping {}'.format(d))
            continue
        new_row = [d]
        new_row.extend(primary_data[d])
        if d in secondary_data:
            new_row.extend(secondary_data[d])
        combined_rows.append(new_row)
    sorted_data = sorted(combined_rows)

    # write the data to the dest file
    logging.debug('Writing combined file.')
    with open(args.dest_file, 'w') as f:
        writer = csv.writer(f, delimiter=args.dest_delim)
        if args.dest_col_headers is not None:
            sorted_data.insert(0, args.dest_col_headers)
        else:
            sorted_data.insert(0, ['domain'])
        writer.writerows(sorted_data)
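# Hedged example invocation; the script and file names are illustrative:
#
#   python combine_domain_lists.py out/domains.tsv \
#       -p primary.csv -domain1 0 -data1 1 2 -delim1 ',' -skip1 1 \
#       -s secondary.tsv -domain2 0 -data2 3 \
#       -dhead domain rank category traffic \
#       -exclude spam.example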
try:
    for match in call[crawler].crawl_full():
        with matchfile(match) as fw:
            fw.write(str(match) + '\n')
        if 0 < match.tostart < 3600 and tbd not in match.teams:
            pool[match.webpage] = match
except KeyboardInterrupt:
    sys.exit(0)
except Exception as e:
    cooldown[crawler] = time.time() + cd
    print('{0} cooldowned for {1}s for exception'.format(crawler, cd))
    print(e)
    cd += 0  # cooldown duration is left unchanged

with open('httpalias', 'a') as fw:
    G = networkx.Graph()
    # count the pooled matches; len(G) here would always be 0,
    # since the graph was just created
    print('{0} matches start <1h'.format(len(pool)))
    for s1, s2 in itertools.combinations(pool.values(), 2):
        # connect only identical matches listed on different sites
        if domain(s1) == domain(s2) or s1 != s2:
            continue
        G.add_edge(s1.webpage, s2.webpage)
    for idx, c in enumerate(networkx.connected_components(G)):
        fw.write(' '.join(c) + '\n')
        print('###MATCHED### {0}: '.format(idx) + ' '.join(c))
        profit = ((max([pool[w].returns[0] for w in c]) - 1)
                  * (max([pool[w].returns[1] for w in c]) - 1) - 1)
        if profit < profit_trs:
            continue
        print(red('  PROFIT {0:.3}'.format(profit)))
        for w in c:
            s = pool[w]
            print(red('    {0} {1} ({2}): {3}/{4} {5} {6}'.format(
                s.teams[0], s.teams[1], s.series, s.returns,
                s.poolsize, domain(s), s.tostart)))
time.sleep(5)
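# Hedged note on the profit test above: (r1-1)*(r2-1) - 1 > 0 rearranges to
# 1/r1 + 1/r2 < 1, the classic two-outcome arbitrage condition. For example,
# with best returns r1=2.2 and r2=2.1 on opposite outcomes at two sites:
#     (2.2-1)*(2.1-1) - 1 = 1.2*1.1 - 1 = 0.32 > 0
#     1/2.2 + 1/2.1 ≈ 0.93 < 1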