Example #1
def main(dir, *args):
    # E, DATE and the copy module are expected to be defined/imported at
    # module level.
    extract = Extract.Extract()
    extract.EXECUTE = E
    extract.DL = 0
    print(DATE)
    extract.run('EXTRACT', DATE, dir, *args)
    # Snapshot the extraction output directories, then reset them for the map
    # stage.
    ex_outdirs = copy.copy(extract.OUTDIRS)
    extract.OUTDIRS = []
    L_key = map_tags(extract, ex_outdirs)
    # Same again for the map output directories before merging into HDFS.
    map_outdirs = copy.copy(extract.OUTDIRS)
    extract.OUTDIRS = []
    merge_hdfs(extract, map_outdirs, L_key)
Example #2
def evaluatee(url):
    # requests, BeautifulSoup, np (numpy) and pickle are expected to be
    # imported at module level.
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    rs = soup.prettify()
    r = rs.encode()
    e = Extract(url, "test", 1, rs, 1, rs, rs)
    e.DoEvaluate()
    # Flag whether the page contains a copyright symbol (UTF-8 bytes for ©).
    abc = 1 if b'\xc2\xa9' in r else 0
    #print(e.phishScore.DomainName)
    # Count links whose markup mentions the domain name (c is computed but not
    # included in the feature vector below).
    links = soup.find_all("a")
    c = 0
    for link in links:
        if e.phishScore.DomainName in str(link):
            c += 1
    # Check whether the <title> tag mentions the domain name.
    title = -1
    title_tag = soup.find("title")
    if title_tag and e.phishScore.DomainName in title_tag.get_text():
        title = 1

    X = [
        e.phishScore.HTTPSPresent, e.phishScore.DomainLength,
        e.phishScore.NonAlphabetical,
        rs.count(e.phishScore.DomainName), e.phishScore.OutsideRationInBody,
        abc, title
    ]  #e.phishScore.TitleContainDomainName]
    X = np.array(X)
    X = X.reshape(1, -1)

    with open("phishing_model.pkl", 'rb') as f:
        classifier = pickle.load(f)
    predict = classifier.predict(X)
    #print(predict)
    if predict[0] == 0:
        print("Phishing Website")
    else:
        print("Legitimate site")
    return predict
Example #3
def user_entry():
    """
    Get user input from command line or from input file and run full program.
    """
    parser = argparse.ArgumentParser(
        prog='CLIMATE_ANALYSIS',
        formatter_class=argparse.RawTextHelpFormatter,
        description=
        """The functions will give statistical analysis of the climate data 
                                     presented
    FILENAMES FORMAT
    ----------------
    - The filenames should be in the format "{START OF FILENAME}_ens{NUM}_{YEAR}.nc", where {START OF FILENAME} is 
    the prefix of the file, this can be the algae type etc, {NUM} is the ensemble number and {YEAR} is the year. 
   OR if you have multiple years stored in one file then:
   - The filenames should be in the format "{START OF FILENAME}_ens{NUM}_{YEAR 1}_{YEAR 2}.nc", where 
   {START OF FILENAME} is the prefix of the file, this can be the algae type etc, {NUM} is the ensemble number and 
   {YEAR 1} and {YEAR 2} are the start and end year of the data in the file. 
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ASSUMPTIONS
    ------------
    - Files do not contain overlapping data.
    - Daily increments of data, except if the monthly tag is set in the arguments.
    - Grids have constant latitude and longitude.
    ------------
    - Some example files are in the data folder.
    """)
    parser._optionals.title = "other arguments"
    parser.add_argument(
        '-pf',
        '--prefix',
        nargs='+',
        required=True,
        help=
        "<Required> This is the prefix of the file - in the filenames format section, this is the START OF FILENAME."
    )
    parser.add_argument('start_date',
                        nargs='+',
                        help="""Start date of analysis 
    Can be in the following formats:
    ----------------------------------
    YYYY-MM-DD : e.g. 2020-04-12
    YYYY-MM    : e.g. 2020-04
    YYYY       : e.g. 2020 
    - If day is not given, the 1st of the given month will be used, i.e. 2020-04 => 2020-04-01
    - If day and month are not given, 1st Jan will be used as the start date, i.e. 2020 => 2020-01-01"""
                        )
    parser.add_argument(
        '-end',
        '--end_date',
        nargs='*',
        help=
        """ <Not required> End date of analysis - format is the same as start_date
    -----------------------------------end_date not given-------------------------------------
    - If only start year is given, the end_date is automatically set to the 31 Dec of start year
    - If start year and month is given, then end_date is set to the end of the start month
       -----------------------------------end_date given-------------------------------------
    - If day is not given, the end of the given month will be used i.e 2020-04 => 2020-04-30
    - If day and month is not given, 31 Dec will be used as the end date i.e 2020 => 2020-12-31"""
    )
    parser.add_argument('-v',
                        '--vars',
                        nargs='+',
                        metavar="variables",
                        help="<Required> Variables of data to analyse",
                        required=True)
    parser.add_argument('-p',
                        '--plot',
                        nargs=1,
                        metavar=("ensemble_number"),
                        help="""Plot map, histogram and timeseries graphs
    E.g. --plot 1
    The ensemble to plot must be included. """)
    parser.add_argument('-m',
                        '--monthly',
                        action="store_true",
                        help="Data in file is stored in monthly increments.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-g',
                       '--grid',
                       nargs='+',
                       # no type=float here: this option may also take a
                       # filename or a keyword, so values stay strings
                       metavar=("(lat, lon) or filename or linear/rotate"),
                       help="""
                       Grid Point: Latitude, Longitude
                        Uses the grid point that the latitude and longitude lie in.
                        Other commands:
                        - You can define a list of grid points in a .txt file, e.g. check INPUT/sample_points.txt
                           - Grid Point: sample_points.txt
                        - You can regrid to a grid (using nearest neighbour interpolation) defined in a NetCDF file:
                           - Grid Point: example_file.nc
                        Cannot be used in conjunction with sample point.
                       """)
    group.add_argument('-s',
                       '--sample',
                       nargs='+',
                       # no type=float here: this option may also take a
                       # filename or a keyword, so values stay strings
                       metavar=("(lat, lon) or filename or linear/rotate"),
                       help="""
                       Sample Point: Latitude, Longitude
                        Uses the sample point given by latitude and longitude using interpolation.
                        Other commands:
                        - You can define a list of sample points in a .txt file, e.g. check INPUT/sample_points.txt
                           - Sample Point: sample_points.txt
                        - You can regrid to a grid (using linear interpolation) defined in a NetCDF file:
                           - Sample Point: example_file.nc
                        Cannot be used in conjunction with grid point.
                       """)
    group.add_argument('-lc',
                       '--lon_centre',
                       nargs=1,
                       type=float,
                       help="Longitude to centre map on.")
    parser.add_argument('-mk',
                        '--mask',
                        nargs=1,
                        metavar="filename",
                        help="Uses masking grid given as a file "
                        "(contains boolean array to be imposed on "
                        "the global grid).")
    parser.add_argument(
        '-o',
        '--output',
        action="store_true",
        help=
        "If plot option selected, save data output of histogram and timeseries "
        "analysis in " + directories.ANALYSIS + " as a .dat file.")
    parser.add_argument('-cv',
                        '--covary',
                        action="store_true",
                        help="Analysis on how the variables given in -v "
                        "vary with each other.")
    parser.add_argument('-e',
                        '--ens',
                        nargs=1,
                        type=int,
                        metavar="number_of_ensembles",
                        help="<Required> The number of ensembles of the data. "
                        "If not set, the default value = 1",
                        required=True)
    parser.add_argument(
        '-ht',
        '--hist',
        nargs='*',
        metavar="number_of_bins_in_histogram",
        help=" Options for bin size selection. If not set, the "
        "default value = fd (Freedman "
        "Diaconis Estimator). The list of the potential "
        "options are listed in: \n"
        "https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges"
    )
    parser.add_argument(
        '-u',
        '--user',
        nargs=2,
        metavar=('file_name', 'function_name'),
        help=
        """Use function written by the user and stored in user_function folder for analysis. 
                        file_name : name of file that contains function in user_function folder
                        function_name : name of function to call 
                        Note: user functions are expected to only take in a cube as an argument. An example of a function 
                        can be found in user_function/example_function.py
                        """)
    parser.add_argument('-a',
                        '--analysis',
                        nargs='+',
                        help="""Analysis performed on data set.
    If not specified, then all analysis listed below will be performed.
    Types of analysis:
    - mean
    - std (Standard deviation)
    - rms (Root mean squared error)
    - median
    You can also select a combination of analysis to perform e.g. -a mean rms """
                        )
    parser.add_argument('-sp',
                        '--spatial',
                        action="store_true",
                        help="Calculates averages spatially.")
    parser.add_argument(
        '-ca',
        '--areas',
        action="store_true",
        help="Calculate areas of grid boxes of latitude and"
        " longitude and saves to NetCDF file areas.nc in results folder")
    parser.add_argument(
        '-t',
        '--total',
        action="store_true",
        help=
        """Total ensemble stats: True/False : The analysis will be performed over the whole ensemble given.
                        - If set True, all the ensembles will be averaged as a collection.
                        - If set False, the ensembles will be averaged individually."""
    )
    parser.add_argument('-i',
                        '--index',
                        metavar=('index'),
                        help="""Calculate index given 
                            The control run is the FIRST file prefix set and the corresponding start/end date. 
                            The future run is the SECOND file prefix set and the corresponding second start/end date
                            Types of inidices that can be calculated:          
                            enso : The Oceanic Niño Index (ONI) 
                            nino12 : Niño 1+2 Index
                            nino4 : Niño 4 Index
                            tni : The Trans-Niño Index (TNI)
                            iod : Indian Ocean Dipole (IOD) Mode Index 
                            amo : Atlantic Multidecadal Oscillation (AMO) Index
                            pdo : Pacific Decadal Oscillation (PDO) Index 
                            ao : Arctic Oscillation (AO; Northern Annular Mode) Index 
                            aao : Antarctic Oscillation (AAO; Southern Annular Mode) Index 
                            nao : North Atlantic Oscillation (NAO) Index
                            """)
    # Log output
    old_stdout = sys.stdout
    log_file = open("output.log", "w")
    sys.stdout = log_file

    # Init progress bar
    sys.stdout = old_stdout
    progress = ProgressBar(n_iter=5,
                           total_width=25,
                           description='Climate Modelling software output')
    sys.stdout = log_file
    # Initialise the variables
    algae_type, start, varbs, ens, end, analysis, spatial, total = None, None, None, None, None, None, None, None
    plot, monthly, grid, sample, mask, output, covary, hist = None, None, None, None, None, None, None, None
    lon_centre, func, calc_areas, index, lat, lon, points_sample_grid = None, None, None, None, None, None, None
    second_date_given, start2, end2 = False, None, None

    # If no arguments are given, use input file
    if len(sys.argv) == 1:
        algae_type, start, varbs, ens, end, analysis, spatial, total, plot, monthly, grid, sample, mask, output, covary, hist, lon_centre, func, calc_areas, index = file_entry(
        )
    elif len(sys.argv) == 2 and (sys.argv[1] == '-ex'
                                 or sys.argv[1] == '--example'):
        algae_type, start, varbs, ens, end, analysis, spatial, total, plot, monthly, grid, sample, mask, output, covary, hist, lon_centre, func, calc_areas, index = file_entry(
            example=True)
    else:
        # Arguments
        args = parser.parse_args()

        algae_type = args.prefix
        start = args.start_date
        varbs = args.vars
        ens = args.ens[0]
        end = args.end_date
        analysis = args.analysis
        spatial = args.spatial
        total = args.total
        plot = args.plot
        monthly = args.monthly
        grid = args.grid
        sample = args.sample
        mask = args.mask
        output = args.output
        covary = args.covary
        hist = args.hist
        lon_centre = args.lon_centre
        func = args.user
        calc_areas = args.areas
        index = args.index

    # Update progress after getting input from user
    sys.stdout = old_stdout
    progress.update()
    sys.stdout = log_file
    # Reconstruct the command-line call as a string (recorded with the output)
    argv = 'python main.py'
    argv = argv + ' ' + start[0]
    if len(start) == 2:
        argv = argv + ' ' + start[1]
    argv = argv + ' -pf ' + algae_type[0]
    if len(algae_type) == 2:
        argv = argv + ' ' + algae_type[1]
    if end:
        argv = argv + ' -end ' + end[0]
        if len(end) == 2:
            argv = argv + ' ' + end[1]
    av = ' '.join(varbs)
    argv = argv + ' -v ' + av + ' -e ' + str(ens)

    if end and len(start) < len(end):
        print("ERROR in function user_entry: Start dates are required.")
        sys.exit()

    if len(algae_type) > 2:
        print(
            "ERROR in function user_entry: Too many arguemnts given for 'Prefix' argument."
        )
        sys.exit()

    if spatial and not analysis:
        print(
            "Error in function user_entry: Spatial argument cannot be set when no analysis is selected."
        )
        sys.exit()

    # All dates
    day_s, mon_s, yr_s, day_e, mon_e, yr_e = None, None, None, None, None, None
    day_s2, mon_s2, yr_s2, day_e2, mon_e2, yr_e2 = None, None, None, None, None, None
    # Get split start date
    if len(start) == 1:
        day_s, mon_s, yr_s = get_date(start[0])
        if not end:  # If end date not given, use the end of start year
            if StartBools.just_start_year:
                end = str(yr_s)
            elif StartBools.just_start_year_month:
                end = str(yr_s) + "-" + str(mon_s)
        else:
            end = end[0]
        # Get split end date
        day_e, mon_e, yr_e = get_date(end, start=False)

    # 2 end years must be given with 2 start years
    if len(start) == 2 and (not end or len(end) != 2):
        print(
            "ERROR in function user_entry: Both end dates must be given with both start dates."
        )
        sys.exit()

    # If extra year is given
    if len(start) == 2:
        second_date_given = True
        # Get first start date
        StartBools.just_start_year, StartBools.just_start_year_month = False, False
        day_s, mon_s, yr_s = get_date(start[0])

        # Get first end date
        fst_end = end[0]
        day_e, mon_e, yr_e = get_date(fst_end, start=False)

        # Get next start
        day_s2, mon_s2, yr_s2 = get_date(start[1])

        # Get next end date
        end = end[1]
        day_e2, mon_e2, yr_e2 = get_date(end, start=False)
    elif len(start) > 2:
        print(
            "ERROR in function user_entry: Too many arguemnts given for 'Start date' argument."
        )
        sys.exit()

    # Print user input
    print("Arguments:")
    if len(algae_type) == 1:
        print("- file prefix: ", algae_type[0])
    elif len(algae_type) == 2:
        print("- first file prefix: ", algae_type[0])
        print("- second file prefix: ", algae_type[1])
    print("- variables: ", varbs)
    print("- start date: " + str(yr_s) + "-" + str(mon_s) + "-" + str(day_s))
    print("- end date: " + str(yr_e) + "-" + str(mon_e) + "-" + str(day_e))
    if second_date_given:
        print("- second start date: " + str(yr_s2) + "-" + str(mon_s2) + "-" +
              str(day_s2))
        print("- second end date: " + str(yr_e2) + "-" + str(mon_e2) + "-" +
              str(day_e2))

    # Check that dates are in valid order
    is_valid = check_valid_order([day_s, mon_s, yr_s], [day_e, mon_e, yr_e])
    if not is_valid:
        print("ERROR in function user_entry: Invalid start and end date")
        print("  - The end date is earlier than the start date")
        sys.exit()
    if second_date_given:
        is_valid = check_valid_order([day_s2, mon_s2, yr_s2],
                                     [day_e2, mon_e2, yr_e2])
        if not is_valid:
            print(
                "ERROR in function user_entry: Invalid second start and second end date"
            )
            print("  - The end date is earlier than the start date")
            sys.exit()
    print("Number of ensembles:", ens)

    if analysis:
        print("Analysis: ", analysis)
        a_ = ' '.join(analysis)
        argv = argv + ' -a ' + a_
        check_analysis(analysis)
    if spatial:
        print("Spatial analysis option selected.")
        argv = argv + ' -sp'
    if total:
        print("Total ensemble stats option selected.")
        argv = argv + ' -t'
    if plot:
        print("Plotting option selected.")
        argv = argv + ' -p ' + str(plot[0])
    else:
        plot = None
    if monthly:
        print("Monthly date expected.")
        argv = argv + ' -m'

    if grid:
        if len(grid) == 2:
            lat, lon = float(grid[0]), float(grid[1])
            print("Grid point option selected.")
            argv = argv + ' -g ' + str(grid[0]) + ' ' + str(grid[1])
        elif len(grid) == 1:
            # Check if txt or nc file or linear or rotate
            check_sample_grid_one_arg(grid, 'user_entry')
            points_sample_grid = grid[0]
            print("Grid point option selected.")
            argv = argv + ' -g ' + str(grid[0])
        else:
            print(
                "ERROR in function user_entry: Grid point argument has invalid number of arguments."
            )
            sys.exit()
    elif sample:
        if len(sample) == 2:
            lat, lon = float(sample[0]), float(sample[1])
            print("Sample point option selected.")
            argv = argv + ' -s ' + str(sample[0]) + ' ' + str(sample[1])
        elif len(sample) == 1:
            # Check if txt or nc file or linear or rotate
            check_sample_grid_one_arg(sample, 'user_entry')
            points_sample_grid = sample[0]
            print("Sample point option selected.")
            argv = argv + ' -s ' + str(sample[0])
        else:
            print(
                "ERROR in function user_entry: Sample point argument has invalid number of arguments."
            )
            sys.exit()

    if mask:
        if isinstance(mask, list):
            mask = mask[0]
        print("Masking grid option selected.")
        argv = argv + ' -mk ' + mask
    if output:
        print("Save analysis data output selected.")
        argv = argv + ' -o'
    if covary:
        print("Co-varying option selected.")
        argv = argv + ' -cv'
        check_variables_covary(varbs)

    if not hist:
        hist = ['fd']
    else:
        argv = argv + ' -ht ' + str(hist[0])
        if len(hist) == 2:
            argv = argv + ' ' + str(hist[1])
        elif len(hist) > 2:
            print(
                "ERROR in function user_entry: Histogram argument has invalid number of arguments."
            )
            sys.exit()

    print("Histogram bin selection option:", hist)

    if func:
        print("User function given: " + str(func[0]) + ", " + str(func[1]))
        argv = argv + ' -u ' + func[0] + ' ' + func[1]

    if calc_areas:
        print("Calculate areas option selected.")
        argv = argv + ' -ca'

    # Check index is given with second date
    if index and not second_date_given:
        print(
            "ERROR in function user_entry: Index must be given with a second start date set."
        )
        sys.exit()

    if index:
        print("Index option selected: " + index)
        argv = argv + ' -i ' + index

    if lon_centre:
        lon_centre = lon_centre[0]
        print("Longitude centering option selected.")
        argv = argv + ' -lc ' + str(lon_centre)

    # Call functions to perform analysis
    start = [day_s, mon_s, yr_s]
    end = [day_e, mon_e, yr_e]
    if second_date_given:
        start2 = [day_s2, mon_s2, yr_s2]
        end2 = [day_e2, mon_e2, yr_e2]

    # Update progress after processing input from user
    sys.stdout = old_stdout
    progress.update()
    sys.stdout = log_file

    # Calculate indices
    if index:  # Self contained action
        calculate_index(algae_type,
                        index,
                        varbs,
                        start,
                        end,
                        start2,
                        end2,
                        monthly=monthly,
                        test=True)
        # Update progress after calculating index
        sys.stdout = old_stdout
        progress.update()
        sys.stdout = log_file
        progress.finish()
        sys.exit()

    # EXTRACT DATA FROM FILES
    extract = Extract(algae_type[0],
                      varbs,
                      start,
                      end,
                      ens,
                      monthly=monthly,
                      lat=lat,
                      lon=lon,
                      grid=grid,
                      points_sample_grid=points_sample_grid,
                      lon_centre=lon_centre,
                      maskfile=mask,
                      calc_areas=calc_areas)
    saved, ens_files, abs_files, full_saved, dim_coords = extract.extract_data(
    )

    saved2, ens_files2, abs_files2, full_saved2 = None, None, None, None
    if second_date_given:
        at = None
        if len(algae_type) == 2:
            at = algae_type[1]
        else:
            at = algae_type[0]
        extract = Extract(at,
                          varbs,
                          start2,
                          end2,
                          ens,
                          monthly=monthly,
                          lat=lat,
                          lon=lon,
                          grid=grid,
                          points_sample_grid=points_sample_grid,
                          lon_centre=lon_centre,
                          maskfile=mask,
                          calc_areas=calc_areas)
        saved2, ens_files2, abs_files2, full_saved2, _ = extract.extract_data()

    # Update progress after extracting data
    sys.stdout = old_stdout
    progress.update()
    sys.stdout = log_file

    # COMPUTE ANALYSIS
    anlys = Analysis(saved)
    ens_stats, func_name, analysis_str, nan_indices = None, None, None, None
    spat_calcs, spat_calcs2 = None, None
    ens_stats2 = None
    if func:  # user analysis
        file_name, func_name = func[0], func[1]
        ens_stats = anlys.compute_user_analysis(file_name, func_name)
    else:
        if second_date_given:
            ens_stats, spat_calcs, spat_calcs2, analysis_str, nan_indices = anlys.calc_stats_difference(
                saved2,
                analysis,
                total=total,
                spatial=spatial,
                dim_coords=dim_coords)
        else:
            ens_stats, analysis_str, nan_indices = anlys.compute_stats_analysis(
                analysis, total=total, spatial=spatial, dim_coords=dim_coords)

    # Warning for mask and sample/grid
    if mask is not None and lat is not None:
        print(
            "WARNING: Please ensure that sample/grid point is in the masked region."
        )

    # Update progress after computing analysis
    sys.stdout = old_stdout
    progress.update()
    sys.stdout = log_file

    # # PLOTTING
    # try:
    #     if plot is not None or output:
    #         plot_ens_num = int(plot[0]) if plot is not None else 1

    #         # Plot histogram
    #         create_histogram(saved, ens_stats, start, end, varbs, sel=hist, monthly=monthly,
    #                          save_out=output, ens_num=plot_ens_num, cov=covary, mask=mask,
    #                          total=total, analysis_str=analysis_str, nan_indices=nan_indices, plot=plot,
    #                          second_date_given=second_date_given, start_date2=start2, end_date2=end2, spatial=spatial)

    #         # Only plot timeseries and map if plot is enabled
    #         if plot is not None:
    #             # Only plot map of analysis if using analysis: mean, median, std or rms and NOT grid/sample point
    #             if analysis_str:
    #                 if func is None or not func:
    #                     plot_map_analysis(ens_stats, varbs, save_out=output, ens_num=plot_ens_num,
    #                                       analysis_str=analysis_str, total=total,
    #                                       second_date_given=second_date_given)
    #                 else:
    #                     print("WARNING: Map not plotted as user function is used.")
    #             else:
    #                 plot_map(saved, varbs, save_out=output, ens_num=plot_ens_num, total=total,
    #                          second_date_given=second_date_given)

    #         # Plot time series and boxplot
    #         if analysis_str:
    #             create_timeseries_analysis(ens_stats, start, end, varbs, analysis_str, monthly=monthly,
    #                                        save_out=output, ens_num=plot_ens_num,
    #                                        second_date_given=second_date_given, total=total, spatial=spatial,
    #                                        calcs=spat_calcs, calcs2=spat_calcs2, plot=plot)
    #         else:
    #             create_timeseries(saved, start, end, varbs,
    #                               save_out=output, ens_num=plot_ens_num, func_name=func_name, monthly=monthly,
    #                               second_date_given=second_date_given, plot=plot)
    #         # Update progress after plotting
    #         progress.update()

    # except Exception as err:
    #     print("Exception thrown in function user_entry when plotting: " + str(err))

    # WRITE ANALYSIS TO NETCDF FILE
    if output:
        wo = WriteOutput(ens_files,
                         abs_files,
                         ens_stats,
                         analysis_str,
                         varbs,
                         start,
                         end,
                         argv,
                         saved,
                         full_saved,
                         total=total,
                         lon_centre=lon_centre,
                         mask=mask,
                         lon=lon,
                         lat=lat,
                         grid=grid,
                         user_func=func_name,
                         points_sample_grid=points_sample_grid,
                         second_date_given=second_date_given,
                         test=True)
        wo.write_analysis_to_netcdf_file()
        # Update progress after writing output
        sys.stdout = old_stdout
        progress.update()
        sys.stdout = log_file

    print("PROGRAM SUCCESSFUL - TERMINAL FINISHED.")
    # End logging
    sys.stdout = old_stdout
    log_file.close()

    # Print to terminal when finished
    print("")
    print_end_statement()

    progress.finish()
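For orientation, a hypothetical way to drive user_entry() programmatically; every value below (prefix, variable, dates) is a made-up placeholder, not taken from the original project:

# To test this file
if __name__ == '__main__':
    import sys
    # Equivalent to: python main.py 2020 -pf algae -v temperature -e 1 -a mean
    sys.argv = ['main.py', '2020', '-pf', 'algae', '-v', 'temperature',
                '-e', '1', '-a', 'mean']
    user_entry()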
Example #4
    # Renaming final dataframe columns and re-organizing columns
    def _clean_final_dataframe(self):
        self.final_dataframe = self.final_dataframe[[
            'gtin', 'cnpj', 'response.nome',
            'response.gepirParty.partyDataLine.address.city', 'response.uf',
            'description', 'category'
        ]]

        self.final_dataframe = self.final_dataframe.rename(
            columns={
                'response.nome': 'razao_social',
                'response.gepirParty.partyDataLine.address.city': 'cidade',
                'response.uf': 'estado'
            })


# To test this file
if __name__ == '__main__':
    import Extract

    ex = Extract.Extract('../../data/input')
    ex.extract_bases()

    tr = Transform(ex.dataframe_info_mix, ex.dataframe_descricoes,
                   ex.dataframe_gs1, ex.dataframe_cnpj, ex.dataframe_cosmos)

    tr.transform_dataframes()

    print(tr.final_dataframe.head())
Example #5
#!/usr/bin/env python3

import threading
import cv2
import numpy as np
import base64
import queue
from Extract import *
from Convert import *
from Display import *

# filename of clip to load
filename = 'clip.mp4'
lock = threading.Lock()
# shared queues between the pipeline stages
extractionQueue = queue.Queue(10)
sendingQueue = queue.Queue(10)
flagQueue = queue.Queue(2)
flagQueue.put(True)
flagQueue.put(True)

threadExtraction = Extract(filename, extractionQueue, lock, flagQueue)
threadExtraction.start()
threadConvert = Convert(extractionQueue, sendingQueue, lock, flagQueue)
threadConvert.start()
threadDisplay = Display(sendingQueue, flagQueue)
threadDisplay.start()
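If the script should block until the clip has been fully processed, the three stages can be joined before exiting; this assumes Extract, Convert and Display subclass threading.Thread (consistent with the start() calls above):

# Wait for each pipeline stage to finish.
threadExtraction.join()
threadConvert.join()
threadDisplay.join()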
Example #6
    def __init__(self, filename='testFile.txt'):
        self.filename = filename
        self.extractObject = Extract(self.filename)
        self.nodes = self.extractObject.nodes
Example #7
    def explore_link(self, url):
        # driver, df, num and vocab_for_findByPartialText are assumed to be
        # defined at module level.
        num_URLs_popped = 0
        print("Started out with the root url that is: {}".format(url))
        navigation_info = Navigation_Information()
        current_path_number = navigation_info.counter_for_paths
        # Start The counting for paths
        print("The count is")
        print(navigation_info.counter_for_paths)
        # create current queue
        queue = Queue_of_Links()
        # add the root url
        if url == self.root:
            queue.add_to_queue(url, "student health center url")
            print("root appended to queue.current_queue")
            print("current queue is : {}".format(queue.current_queue))
            dictionary = navigation_info.update_dictionary_of_paths(url)
            print(dictionary)

        while queue.current_queue:
            if num_URLs_popped > 25:
                content = '-'
                navigation_info.generate_result(df, num, self.root,
                                                self.LARC_found,
                                                self.LARCflag_and_terms,
                                                current_url, current_url_Text,
                                                content)

                break
            else:
                current_path = navigation_info.get_current_path()
                # Number of urls that have been popped so far
                print("num_URLs_popped")
                print(num_URLs_popped)
                current_url, current_url_Text = queue.pop_from_queue()
                num_URLs_popped = num_URLs_popped + 1
                print("Current Url")
                print(current_url)
                print("has been popped")
                ExtractObject = Extract(driver, current_url)

                response_code = check_response_code(current_url)

                if (response_code == 1):
                    print(
                        "Exception while getting content with requests library"
                    )

                elif response_code in (400, 401, 402, 403, 405, 406, 407, 408):
                    print(
                        "Error response code while getting content with requests library"
                    )

                elif (response_code == 0):
                    print("Okay to get content with requests library")
                self.scraping_attempted = True

                content = ExtractObject.get_content()
                if content == 1:
                    # Probably too many GET requests to the website from the
                    # same IP; add a timer, skip this URL, or end the program.
                    print("Exception")
                    continue

                external_links = ExtractObject.get_ext_links()
                print("########printing all the external links#######")
                print(external_links)

                self.LARCflag_and_terms = navigation_info.check_for_LARC(
                    content)
                self.LARC_found = self.LARCflag_and_terms[0][0]
                if self.LARC_found:
                    navigation_info.generate_result(df, num, self.root,
                                                    self.LARC_found,
                                                    self.LARCflag_and_terms,
                                                    current_url,
                                                    current_url_Text, content)
                    break

                else:
                    # A mention of LARC was not found on the current page, so
                    # explore all the links on the page that could lead to
                    # LARC resources.
                    print("Exploring these links from the current page")
                    links = ExtractObject.get_links_byPartialText(
                        vocab_for_findByPartialText)
                    print("Total number of links found in url %s" %
                          (current_url))
                    print(len(links))

                    found_linkTexts = []
                    found_linkHrefs = []
                    found_linkDict = {}

                    for link in links:
                        try:
                            link_valid = ExtractObject.check_link(link)
                            if link_valid:
                                if link.text not in found_linkTexts:
                                    found_linkTexts.append(link.text)
                                    found_linkHrefs.append(
                                        link.get_attribute("href"))
                                    found_linkDict[
                                        link.text] = link.get_attribute("href")
                        except Exception:
                            continue
                    print("spawn_length(considers unique links)")

                    spawn_length = len(found_linkTexts)
                    print(spawn_length)
                    dictionary = navigation_info.update_dictionary_of_paths(
                        "_", spawn_length, found_linkTexts)
                    print("dictionary")
                    print(dictionary)

                    for i, k in enumerate(found_linkHrefs):
                        if k not in queue.current_queue:
                            queue.add_to_queue(k, found_linkTexts[i])

                    found_linkTexts = []
                    found_linkHrefs = []
                    found_linkDict = {}
                    links = []
                    print("queue.current_queue")
                    print(queue.current_queue)
                    print("queue.current_queue_text")
                    print(queue.current_queue_text)

        # Out of the BFS loop
        print("out of the loop")

        if not queue.current_queue:
            print("Nothing to write to file")
            current_url = '-'
            current_url_Text = '-'
            content = '-'
            navigation_info.generate_result(df, num, self.root,
                                            self.LARC_found,
                                            self.LARCflag_and_terms,
                                            current_url, current_url_Text,
                                            content)

        if self.TimeOutException_flag:
            if self.scraping_attempted:
                current_url = '-'
                current_url_Text = '-'
                content = '-'
                navigation_info.generate_result(df, num, self.root,
                                                self.LARC_found,
                                                self.LARCflag_and_terms,
                                                current_url, current_url_Text,
                                                content)

            else:
                current_url = '-'
                current_url_Text = '-'
                content = '-'
                print("Nothing to write to file")
                navigation_info.generate_result(df, num, self.root,
                                                self.LARC_found,
                                                self.LARCflag_and_terms,
                                                current_url, current_url_Text,
                                                content)