Example #1
def save_url_body(extractor):
    try:
        config.logger.info('extracting features for: ' + extractor.url)
        # MD5 of the local file path gives a stable file name for the marseille input
        file_hash = get_md5_from_string(extractor.local_file_path)
        text = extractor.webscrap.get_body()
        with open(config.root_dir_data + 'marseille/input/' + file_hash +
                  '.txt', "w") as file:
            file.write(text)

    except Exception as e:
        config.logger.error(repr(e))
        raise
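
A minimal usage sketch, assuming the FeaturesCore extractor shown in the later examples (the URL and local file path below are placeholders, not taken from the codebase):

# hypothetical driver code; FeaturesCore, call_web_scrap() and the error flag
# all appear in Examples #3-#6
extractor = FeaturesCore('https://example.org/article.html',
                         local_file_path='/tmp/article.html')
if extractor.webscrap is None:
    extractor.call_web_scrap()
if not extractor.error:
    save_url_body(extractor)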
Example #2
def get_number_of_arguments(self, url):
    try:
        urlcode = get_md5_from_string(url)
        # marseille/output.json maps the MD5 of each URL to its argument count
        with open(config.root_dir_data + 'marseille/output.json',
                  'r') as fh:
            dargs = json.load(fh)
        try:
            tot_args = dargs[urlcode]
            return tot_args
        except KeyError:
            config.logger.warning(
                "this should not happen, but let's move on for now; "
                "check the marseille dump files/pre-processing!")
            raise
    except Exception as e:
        config.logger.error(repr(e))
        raise
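
For context, a sketch of the same lookup as a standalone script, assuming marseille's output.json maps the MD5 hash of each URL to its argument count (the class this method belongs to is not shown in the snippet):

# hypothetical, self-contained illustration of the lookup performed above
import hashlib
import json

url = 'https://example.org/article.html'
urlcode = hashlib.md5(url.encode('utf-8')).hexdigest()  # what get_md5_from_string is assumed to return
with open('output.json', 'r') as fh:
    dargs = json.load(fh)  # e.g. {"<md5-of-url>": <argument count>, ...}
print(dargs.get(urlcode, 0))  # 0 when the URL was never pre-processed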
Example #3
def get_features_web(id, extractor, topic, query, rank, url, likert,
                     likert_mode, likert_avg, folder, name, export_html_tags):

    try:
        config.logger.info('processing file ' + str(id))
        if extractor.webscrap is None:
            extractor.call_web_scrap()

        if not extractor.error:

            #config.logger.debug('process starts for : ' + extractor.url)

            data = collections.defaultdict(dict)
            data['topic'] = topic
            data['query'] = query
            data['rank'] = rank
            data['url'] = url
            data['hash'] = get_md5_from_string(url)
            data['likert'] = likert  # Microsoft dataset
            data['likert_mode'] = likert_mode  # C3 dataset
            data['likert_avg'] = likert_avg  # C3 dataset
            data['html2seq'] = []

            err, out = extractor.get_final_feature_vector()
            config.logger.debug('total of features function errors: ' +
                                str(err))

            # text/
            data['features'] = out
            extractor.tot_feat_extraction_errors = err

            # save html?
            html_error = False
            if export_html_tags:
                with open(folder + 'html/' + name.replace('.pkl', '.txt'),
                          "w") as file:
                    # only dump raw HTML when the scraper actually produced a parsed page
                    soup = extractor.webscrap.soup
                    if soup is not None:
                        file.write(str(soup))
                    else:
                        html_error = True

            if not html_error:
                try:
                    data['html2seq'] = get_html2seq(extractor)
                    joblib.dump(data, folder + 'ok/' + name)
                    config.logger.debug('OK: ' + extractor.url)
                    return data
                except Exception:
                    Path(folder + 'error/' + name).touch()
                    config.logger.error('Err: ' + extractor.url)
            else:
                Path(folder + 'error/' + name).touch()
                config.logger.error('Err: ' + extractor.url)
        else:
            Path(folder + 'error/' + name).touch()
            config.logger.error('extractor error: ' + extractor.url)

    except Exception as e:
        Path(folder + 'error/' + name).touch()
        config.logger.error('exception: ' + extractor.url + ' - ' + repr(e))
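
For reference, a single synchronous call that mirrors one of the job-argument tuples built in Examples #4 and #5; the URL and folder are placeholders, and the folder is expected to contain ok/, error/ and html/ subdirectories:

# hypothetical single-document run; the argument order matches the job_args tuples below
url = 'https://example.org/article.html'
name = get_md5_from_string(url) + '.pkl'
folder = OUTPUT_FOLDER + 'exp003/' + '3c/' + 'features/'
fe = FeaturesCore(url)
data = get_features_web(1, fe, 'some topic', 'some query', 1, url, 0, 0, 0,
                        folder, name, export_html_tags=False)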
Example #4
def __export_features_multi_proc_3c(exp_folder, ds_folder, export_html_tags,
                                    force):
    assert (exp_folder is not None and exp_folder != '')
    # get the parameters
    config.logger.info('reading 3C dataset...')
    df_sites = pd.read_csv(DATASET_3C_SITES_PATH,
                           na_values=0,
                           delimiter=',',
                           usecols=['document_id', 'document_url'])
    df_scores = pd.read_csv(DATASET_3C_SCORES_PATH,
                            na_values=0,
                            delimiter=';',
                            usecols=[
                                'average(documentevaluation_credibility)',
                                'mode(documentevaluation_credibility)',
                                'document_id'
                            ])

    df_sites.set_index('document_id', inplace=True)
    #df_scores.set_index('document_id', inplace=True)

    config.logger.info('creating job args...')
    job_args = []
    err = 0
    tot_proc = 0
    tot = 0
    cached = []
    for doc_index, row in df_sites.iterrows():
        tot += 1
        url = str(row['document_url'])
        url_id = doc_index
        urlencoded = get_md5_from_string(url)
        name = urlencoded + '.pkl'
        folder = OUTPUT_FOLDER + exp_folder + ds_folder + 'features/'
        my_file = Path(folder + 'ok/' + name)
        my_file_err = Path(folder + 'error/' + name)
        if (not my_file.exists()
                and not my_file_err.exists()) or force is True:
            temp = df_scores['document_id'].isin([url_id])
            likert_mode = df_scores.loc[temp,
                                        'mode(documentevaluation_credibility)']
            likert_avg = df_scores.loc[
                temp, 'average(documentevaluation_credibility)']
            #likert_mode = df_scores.loc[url_id]['mode(documentevaluation_credibility)']
            #likert_avg = df_scores.loc[url_id]['average(documentevaluation_credibility)']
            fe = FeaturesCore(url)
            if not fe.error:
                job_args.append(
                    (tot, fe, 0, 0, 0, url, 0, likert_mode, likert_avg, folder,
                     name, export_html_tags))  # -> multiple arguments
                tot_proc += 1
                if tot_proc > MAX_WEBSITES_PROCESS - 1:
                    config.logger.warning(
                        'max number of websites reached: ' +
                        str(MAX_WEBSITES_PROCESS))
                    break
            else:
                err += 1
            if tot_proc % 100 == 0:
                config.logger.info('processing job args ' + str(tot_proc))
        elif my_file.exists():
            data = joblib.load(str(my_file))
            cached.append(data)

    config.logger.info(
        '%d job args created (out of %d): starting multiprocessing' %
        (len(job_args), len(df_sites)))
    config.logger.info("apart from the jobs, we've got %d errors" % err)
    config.logger.info(str(multiprocessing.cpu_count()) + ' CPUs available')
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        asyncres = pool.starmap(get_features_web, job_args)

    if len(cached) > 0:
        asyncres.extend(cached)

    config.logger.info('feature extraction done! saving...')
    name = 'features.complex.all.' + str(len(job_args) + len(cached)) + '.pkl'
    joblib.dump(asyncres,
                OUTPUT_FOLDER + exp_folder + ds_folder + 'features/' + name)
    config.logger.info('done! file: ' + name)
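
A hedged invocation sketch; the experiment and dataset folder names are placeholders chosen to match the model path seen in Example #6:

# hypothetical call; the resulting pickle lands under OUTPUT_FOLDER + 'exp003/3c/features/'
__export_features_multi_proc_3c('exp003/', '3c/',
                                export_html_tags=True, force=False)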
Example #5
def __export_features_multi_proc_microsoft(exp_folder, ds_folder,
                                           export_html_tags, force):

    assert (exp_folder is not None and exp_folder != '')
    assert (ds_folder is not None and ds_folder != '')

    try:

        df = pd.read_csv(DATASET_MICROSOFT_PATH, delimiter='\t', header=0)
        config.logger.info('creating job args for: ' + DATASET_MICROSOFT_PATH)
        job_args = []
        tot_proc = 0
        err = 0
        id = 0
        cached = []
        for index, row in df.iterrows():
            id += 1
            url = str(row[3])
            urlencoded = get_md5_from_string(url)
            name = urlencoded + '.pkl'
            folder = OUTPUT_FOLDER + exp_folder + ds_folder + 'features/'
            my_file = Path(folder + 'ok/' + name)
            my_file_err = Path(folder + 'error/' + name)
            if (not my_file.exists()
                    and not my_file_err.exists()) or force is True:
                topic = row[0]
                query = row[1]
                rank = int(row[2])
                likert = int(row[4])
                # only pass a local file path when a cached HTML copy exists;
                # calling str() on the result first would make the check always true
                path = get_html_file_path(url)
                if path is not None:
                    fe = FeaturesCore(url, local_file_path=str(path))
                else:
                    fe = FeaturesCore(url)
                if not fe.error:
                    job_args.append(
                        (id, fe, topic, query, rank, url, likert, 0, 0, folder,
                         name, export_html_tags))  # -> multiple arguments
                    tot_proc += 1
                    if tot_proc > MAX_WEBSITES_PROCESS - 1:
                        config.logger.warning(
                            'max number of websites reached: ' +
                            str(MAX_WEBSITES_PROCESS))
                        break
                else:
                    err += 1

                if index % 100 == 0:
                    config.logger.info('processing job args ' + str(index))
                # extractors.append(fe) # -> single argument
            elif my_file.exists():
                data = joblib.load(str(my_file))
                cached.append(data)

        config.logger.info(
            '%d job args created (out of %d): starting multiprocessing' %
            (len(job_args), len(df)))
        config.logger.info("apart from the jobs, we've got %d errors" % err)
        config.logger.info(
            str(multiprocessing.cpu_count()) + ' CPUs available')
        with Pool(processes=multiprocessing.cpu_count()) as pool:
            asyncres = pool.starmap(get_features_web, job_args)
            #asyncres = pool.map(get_features_web, extractors)

        if len(cached) > 0:
            asyncres.extend(cached)

        config.logger.info('feature extraction done! saving...')
        name = 'features.complex.all.' + str(len(job_args) +
                                             len(cached)) + '.pkl'
        joblib.dump(
            asyncres,
            OUTPUT_FOLDER + exp_folder + ds_folder + 'features/' + name)
        config.logger.info('done! file: ' + name + ' (' + OUTPUT_FOLDER +
                           exp_folder + ds_folder + 'features/)')

    except Exception as e:
        config.logger.error(repr(e))
Example #6
            sep="\t",
            na_values=0,
            low_memory=False,
            skiprows=1)

        #file = BENCHMARK_FILE_NAME_TEMPLATE % (BEST_CLS_2class, BEST_PAD_2class, 'bin')
        file = 'cls_decisiontreeclassifier_bin_0_bin.pkl'
        print('loading model: ' + file)
        clf = joblib.load(OUTPUT_FOLDER + 'exp003/3c/models/text_features/' +
                          file)
        encoder = joblib.load(config.enc_domain)

        for index, row in df_annotations_humans.iterrows():
            url = str(row[0])
            claim = row[1]
            likert = row[2]
            urlencoded = get_md5_from_string(url)
            extractor = FeaturesCore(url)
            if extractor.webscrap is None:
                extractor.call_web_scrap()

            if not extractor.error:
                # get_final_feature_vector() returns (error count, feature list),
                # as in Example #3, so unpack it before indexing
                err, out = extractor.get_final_feature_vector()
                out[3] = encoder.transform([out[3]])[0]
                del out[2]

                prediction = clf.predict([out])
                print(url, prediction[0])

    except:
        raise