Example #1
def test_processor(feature_size, chunksize):
    """Tests FileProcessor.

    :param feature_size: int, size of feature vector.
    :param chunksize: int, FileProcessor chunk size.
    """
    sep = '\t'
    n_rows = 50
    feature = 3

    with TemporaryDirectory() as tmp_dir:
        input_path = os.path.join(tmp_dir, 'data.tsv')
        output_path = os.path.join(tmp_dir, 'data_proc.tsv')

        data, feature_values = generate_data(n_rows=n_rows,
                                             feature=feature,
                                             feature_size=feature_size,
                                             seed=42)

        data.to_csv(input_path, sep=sep, index=False)

        reader_params = {
            'chunksize': chunksize,
            'sep': sep,
        }

        transformers = (
            Standardizer,
            MaxIndex,
            MaxFeatureAbsMeanDiff,
        )
        processor = FileProcessor(transformers, reader_params=reader_params)
        processor.train(input_path)
        processor.process(input_path, output_path)

        processed = pd.read_csv(output_path, sep=sep)

    # check feature_{i}_stand_{index}
    expected_stand = ((feature_values - feature_values.mean(axis=0)) /
                      feature_values.std(axis=0, ddof=1))
    stand = processed.filter(regex=f'feature_{feature}_stand_[0-9]+')
    assert np.allclose(expected_stand, stand)
    assert np.allclose(stand.mean(axis=0), 0)
    assert np.allclose(stand.std(axis=0, ddof=1), 1)

    # check max_feature_{i}_index
    expected_max = feature_values.max(axis=1)
    max_index = processed[f'max_feature_{feature}_index'].values
    max_mask = (max_index.reshape((-1, 1)) ==
                np.arange(feature_values.shape[1]).reshape((1, -1)))
    fact_max = feature_values[max_mask]
    assert np.allclose(expected_max, fact_max)

    # check max_feature_{i}_abs_mean_diff
    expected_max_mean = np.broadcast_to(feature_values.mean(axis=0),
                                        shape=max_mask.shape)[max_mask]
    expected_abs_mean_diff = np.abs(expected_max - expected_max_mean)
    abs_mean_diff = processed[f'max_feature_{feature}_abs_mean_diff']
    assert np.allclose(expected_abs_mean_diff, abs_mean_diff)
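The mask trick used above (comparing the stored max index against np.arange to pull each row's maximum back out of the feature matrix) is easy to sanity-check on a tiny array. The snippet below is a standalone NumPy illustration of that step, not part of the test:

import numpy as np

values = np.array([[1.0, 5.0, 2.0],
                   [7.0, 0.0, 3.0]])
max_index = values.argmax(axis=1)  # [1, 0]

# Broadcasting the index column against a row of column numbers yields a
# boolean mask that is True exactly at each row's maximum.
mask = max_index.reshape((-1, 1)) == np.arange(values.shape[1]).reshape((1, -1))
assert np.allclose(values[mask], values.max(axis=1))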
Example #2
def backup(config, database):
    """
    Process directoreis to be backed up
    """
    logger = logging.getLogger('mylog')

    clean_removed(config, database)

    if not consistency_check(config, database):
        logger.warning('Consistency check detected problems!')
        if not query_yes_no('Continue?'):
            sys.exit()

    dirs = config.get('Backup', 'to_backup').split()
    logger.info('Directories to backup: ' + ','.join(dirs))

    exclude_dirs = config.get('Backup', 'to_exclude').split()
    logger.info('Directories to exclude: ' + ','.join(exclude_dirs))


    file_proc = FileProcessor(config, database, encrypt_file)

    # Count files so progress can be reported later.
    total = 0

    for directory in dirs:
        for subdir, _, files in os.walk(directory):
            if subdir in exclude_dirs:
                continue
            total += len(files)

    count = 0
    for directory in dirs:
        logger.debug('Processing directory ' + directory)
        for subdir, _, files in os.walk(directory):
            if subdir in exclude_dirs:
                logger.debug('Skipping directory ' + subdir)
                continue

            for single_file in files:
                fpath = os.path.join(subdir, single_file)
                logger.debug('Processing file ' + fpath)
                logger.info(str((count * 100) / total) + ' %')
                file_proc.process(fpath)
                count += 1
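One behavioral note on the exclusion check in backup(): comparing subdir against exclude_dirs only skips a directory whose full path matches an entry exactly, while os.walk still descends into its children. If whole subtrees are meant to be excluded, the conventional approach is to prune the dirnames list in place. This is a sketch of that pattern with hypothetical paths, not part of the original script:

import os

exclude_dirs = ['/home/user/skip']  # hypothetical exclusion list
for subdir, dirnames, files in os.walk('/home/user'):  # hypothetical root
    # Pruning dirnames in place stops os.walk (topdown mode, the default)
    # from descending into excluded subtrees.
    dirnames[:] = [d for d in dirnames
                   if os.path.join(subdir, d) not in exclude_dirs]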
Example #3
@attr.s
class SecondPassCrawler:
    session = attr.ib()
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)

    def __attrs_post_init__(self):
        self.navigator = SeleniumNavigator(loading_strategy='none')
        self.file_processor = FileProcessor()

    def exit(self):
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        # results = self.session.query(Report).all()
        # return (report.url for report in results)
        return ['http://media.ethics.ga.gov/search/Campaign/Campaign_ReportOptions.aspx?NameID=16067&FilerID=C2012000744&CDRID=59991']

    def add_scrapelog_to_db(self, url, content, dtime):
        slog = ScrapeLog(scrape_date=dtime,
                         raw_data=content,
                         page_url=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self):
        parser = CSVLinkParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            url = self.navigator.get_current_url()
            self.navigator.click_link(parsed_link)
            logging.info('Clicking download link for csv file.')
            content, dtime = self.file_processor.process()
            self.add_scrapelog_to_db(url, content, dtime)
            self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self):
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        parser = ContributionsViewParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            self.navigator.click_link(parsed_link)
            self.navigator.wait_for_csv_link()
            self.crawl_download_link()

    def crawl(self):
        urls = self.get_urls()
        for url in urls:
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id() 
            self.crawl_view_contributions_ids()
Example #4
        header = True
        for seed in tqdm(range(100), desc='generating data'):
            data, _ = generate_data(
                n_rows=100_000,
                feature=feature,
                feature_size=512,
                seed=seed
            )
            data.to_csv(
                input_path,
                sep=sep,
                index=False,
                mode='a',
                header=header
            )
            header = False

        reader_params = {
            'chunksize': 10_000,
            'sep': sep,
        }

        transformers = (
            Standardizer,
            MaxIndex,
            MaxFeatureAbsMeanDiff,
        )
        processor = FileProcessor(transformers, reader_params=reader_params)
        processor.train(input_path)
        processor.process(input_path, output_path)
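This excerpt begins inside an already-open block, so input_path, output_path, sep, and feature are defined earlier in the test. A minimal sketch of the assumed surrounding setup, modeled on Example #1 (the temporary-directory layout, separator, and feature index are assumptions):

import os
from tempfile import TemporaryDirectory

sep = '\t'    # assumed separator, as in Example #1
feature = 3   # assumed feature index, as in Example #1

with TemporaryDirectory() as tmp_dir:
    input_path = os.path.join(tmp_dir, 'data.tsv')
    output_path = os.path.join(tmp_dir, 'data_proc.tsv')
    # ... the generation and processing loop from the excerpt runs here ...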
Example #5
@attr.s
class SecondPassCrawler:
    session = attr.ib()
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)
    letter = attr.ib()

    def __attrs_post_init__(self):
        logging.info('attrs post init called')
        self.navigator = SeleniumNavigator(loading_strategy='none',
                                           letter=self.letter)
        self.file_processor = FileProcessor(letter=self.letter)

    def exit(self):
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        _ids = self.session.query(Candidate).filter(
            Candidate.Lastname.ilike('zorn')).all()
        #ids_ = \
        #self.session.query(Candidate).filter(Lastname.like("%z%")).all()
        reports = []
        for _id in _ids:
            results = self.session.query(Report).filter_by(
                CandidateId=_id.CandidateId).all()
            logging.info(results)
            for result in results:
                reports.append((result.CandidateId, result.Url))
        return reports
        #return ['http://media.ethics.ga.gov/search/Campaign/Campaign_ReportOptions.aspx?NameID=16067&FilerID=C2012000744&CDRID=59991']

    def add_scrapelog_to_db(self, _id, url, content, dtime):
        slog = ScrapeLog(CandidateId=_id,
                         ScrapeDate=dtime,
                         RawData=content,
                         PageURL=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self, _id):
        parser = CSVLinkParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            url = self.navigator.get_current_url()
            self.navigator.click_link(parsed_link)
            logging.info('Clicking download link for csv file.')
            content, dtime = self.file_processor.process()
            logging.info('Adding scrapelog to database')
            self.add_scrapelog_to_db(_id, url, content, dtime)
            self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self, _id):
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        parser = ContributionsViewParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            self.navigator.click_link(parsed_link)
            self.navigator.wait_for_csv_link()
            self.crawl_download_link(_id)

    def crawl(self):
        for _id, url in self.get_urls():
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id()
            self.crawl_view_contributions_ids(_id)
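A usage sketch for this attrs class: session is assumed to be a SQLAlchemy session and the letter keyword corresponds to the attribute above. The engine URL and Session setup are assumptions, not taken from the original project:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///campaign.db')  # assumed database URL
Session = sessionmaker(bind=engine)

crawler = SecondPassCrawler(session=Session(), letter='z')
try:
    crawler.crawl()
finally:
    # Close the browser and the database session even if crawling fails.
    crawler.exit()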
Example #6
parser.add_argument(
    '-o', '--output',
    default=os.path.join(DATA_DIR, OUTPUT_FILE),
    help='Path to the file where the processed data will be stored.',
)
parser.add_argument(
    '-c', '--chunk_size',
    type=int,
    default=128,
    help='Number of rows to process at a time.',
)
args = parser.parse_args()


if __name__ == '__main__':
    train_path = args.train
    test_path = args.input
    output_path = args.output
    reader_params = {
        'chunksize': args.chunk_size,
        'sep': SEPARATOR,
    }

    transformers = (
        Standardizer,
        MaxIndex,
        MaxFeatureAbsMeanDiff,
    )
    processor = FileProcessor(transformers, reader_params=reader_params)
    processor.train(train_path)
    processor.process(test_path, output_path)
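The excerpt above references args.train and args.input, so the parser must define those arguments earlier in the script. A minimal sketch of the assumed setup; the argument names are inferred from the excerpt, and the constants are placeholders:

import argparse
import os

DATA_DIR = 'data'              # assumed constant
OUTPUT_FILE = 'data_proc.tsv'  # assumed constant
SEPARATOR = '\t'               # assumed constant

parser = argparse.ArgumentParser(description='Process a feature file with FileProcessor.')
parser.add_argument(
    '-t', '--train',
    help='Path to the file used to train the transformers.',
)
parser.add_argument(
    '-i', '--input',
    help='Path to the file to process.',
)
# The -o/--output and -c/--chunk_size arguments follow, as in the excerpt.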