def process_file(file_path_input, file_path_output, i):
    """Extract and store element features for every URL of one input file.

    Each line of ``file_path_input`` is tab-separated with the URL in the
    second column. For every URL a child process runs
    ``get_element_features`` and hands its result back through a queue; when
    a result arrives, a second child process runs ``write_element_features``
    to append rows to ``file_path_output``. A single PhantomJS driver is
    created per input file and shut down when the file is finished or an
    error escapes.

    Args:
        file_path_input: Path of the tab-separated file to read.
        file_path_output: Path of the tab-separated file to append to. If it
            already exists the input file is treated as done and skipped.
        i: Identifier of the calling worker; used in log output and
            forwarded to the child processes.

    Returns:
        ``'done'`` after the whole input file was processed, ``None`` when
        the output file already existed.

    Raises:
        IndexError: If a line has no second tab-separated column.
    """
    # Imported locally so this block does not depend on the file's import
    # header; Queue.get(timeout=...) signals a timeout with queue.Empty.
    from queue import Empty

    if os.path.exists(file_path_output):
        return print('{}    {}  Already done'.format(now(), file_path_output))

    # Context managers guarantee both files are closed, even on errors.
    with open(file_path_input, 'r') as input_file, \
            open(file_path_output, 'a') as output_file:
        # every worker creates a separate output file per input file
        writer = csv.writer(output_file, delimiter='\t')

        # for every file we create a separate driver (100 URLs)
        driver = webdriver.PhantomJS(executable_path=path_to_phantomjs)
        try:
            for line in input_file:
                # Drop the line terminator so it does not end up in the URL.
                splited = line.rstrip('\r\n').split('\t')
                url = splited[1]
                print('{}   Process={}  Current url: {}'.format(
                    now(), i, url))

                # Feature extraction runs in its own process so that a page
                # that hangs can be abandoned after TIME_OUT_FEATURE seconds.
                temp_queue = Queue()
                p = Process(target=get_element_features,
                            args=(url, driver, temp_queue, i))
                print("{}   {}  Process={}  {}  {}".format(
                    now(), i, "Started: ", "feature extraction", url))
                p.start()
                try:
                    event_features = temp_queue.get(timeout=TIME_OUT_FEATURE)
                except Empty:
                    # No result in time: skip this URL instead of aborting
                    # the whole file.
                    event_features = None
                    print("{}   {}  Process={}  {}  {}".format(
                        now(), i, "Timed out on: ", "feature extraction",
                        url))

                if p.is_alive():
                    p.terminate()

                print("Event features:" + str(event_features))

                if event_features is not None:
                    print("{}   Process={}  Got properties for  {}".format(
                        now(), i, url))

                    # Write the features from a separate process. The writer
                    # process itself must be started (not ``p`` again) and
                    # given time to finish before it is considered stuck.
                    p_event_features = Process(
                        target=write_element_features,
                        args=(event_features, writer, i, output_file))
                    p_event_features.start()
                    p_event_features.join(TIME_OUT_FEATURE)
                    if p_event_features.is_alive():
                        print("{}   {}  Process={}  {}  {}".format(
                            now(), i, "Timed out on: ", "feature writing",
                            url))
                        p_event_features.terminate()
        finally:
            # Always shut the browser down, otherwise a failing file would
            # leave its PhantomJS process running.
            driver.service.process.send_signal(signal.SIGTERM)
            driver.quit()
    return 'done'
Esempio n. 2
0
def worker(file_ids, i):
    """Process every file id in ``file_ids``, one after another.

    The input and output paths of each id come from ``get_filepath``. Any
    exception raised while a file is being processed is printed and the loop
    moves on to the next id.

    Args:
        file_ids: Iterable of file ids to process.
        i: Identifier of this worker, used in log output and passed on to
            ``process_file``.
    """
    started_msg = '{}   Process={}  Started to process file {}'
    failed_msg = '{}   Process={}  Bad file!    {}'

    for current_id in file_ids:
        source_path = get_filepath(current_id)
        target_path = get_filepath(current_id, input=False)
        try:
            print(started_msg.format(now(), i, source_path))
            process_file(source_path, target_path, i)
        except Exception as err:
            # One failing file must not stop the remaining ones.
            print(failed_msg.format(now(), i, err))
def start_with_timeout(process, timeout, msg, url, i):
    """Run ``process`` for at most ``timeout`` seconds.

    The process is started if that has not happened yet, then joined with the
    given timeout. If it is still alive afterwards, a "Timed out" line is
    printed and the process is terminated.

    Args:
        process: A ``multiprocessing.Process``; may be unstarted or already
            started.
        timeout: Maximum number of seconds to wait for the process.
        msg: Short description of the step, used in the timeout log line.
        url: URL being handled, used in the timeout log line.
        i: Identifier of the calling worker, used in the timeout log line.
    """
    # ``pid`` is None until the process has been spawned. Joining an
    # unstarted process raises, so start it here; already-started processes
    # are left alone, which keeps existing callers working.
    if process.pid is None:
        process.start()

    process.join(timeout)
    if process.is_alive():
        print("{}   {}  Process={}  {}  {}".format(now(), i, "Timed out on: ",
                                                   msg, url))
        # Flush before terminating so the log line is not lost.
        sys.stdout.flush()
        process.terminate()
Esempio n. 4
0
def process_file(file_path_input, file_path_output, i):
    """Extract and store event features for every URL of one input file.

    Each line of ``file_path_input`` is tab-separated: property type in the
    first column, URL in the second. For every URL a child process targeting
    ``get_microformat_properties_by_type`` is handed to
    ``start_with_timeout``; if it left a result on the queue, a second child
    process targeting ``get_event_features_and_write`` is run the same way
    to append rows to ``file_path_output``. A single PhantomJS driver is
    created per input file and quit when the file is finished or an error
    escapes.

    Args:
        file_path_input: Path of the tab-separated file to read.
        file_path_output: Path of the tab-separated file to append to. If it
            already exists the input file is treated as done and skipped.
        i: Identifier of the calling worker; used in log output and
            forwarded to the child processes.

    Returns:
        ``'done'`` after the whole input file was processed, ``None`` when
        the output file already existed.

    Raises:
        IndexError: If a line has no second tab-separated column.
    """
    if os.path.exists(file_path_output):
        return print('{}    {}  Already done'.format(now(), file_path_output))

    # Context managers guarantee both files are closed, even on errors.
    with open(file_path_input, 'r') as input_file, \
            open(file_path_output, 'a') as output_file:
        # every worker creates a separate output file per input file
        writer = csv.writer(output_file, delimiter='\t')

        # for every file we create a separate driver (100 URLs)
        driver = webdriver.PhantomJS(executable_path=path_to_phantomjs)
        try:
            for line in input_file:
                # Drop the line terminator so it does not end up in the URL.
                splited = line.rstrip('\r\n').split('\t')
                property_type = splited[0]
                url = splited[1]
                print('{}   Process={}  Current url: {}'.format(
                    now(), i, url))

                # NOTE(review): neither process is started in this function;
                # this relies on start_with_timeout starting the process it
                # is given - confirm.
                temp_queue = Queue()
                p = Process(target=get_microformat_properties_by_type,
                            args=(url, property_type, temp_queue, i))
                start_with_timeout(p, TIME_OUT_LOAD, "loading", url, i)

                # Take the result only if one is available, so a timed-out
                # load does not block here.
                event_properties = (temp_queue.get()
                                    if not temp_queue.empty() else None)
                if p.is_alive():
                    p.terminate()
                if event_properties is not None:
                    print("{}   Process={}  Got properties for  {}".format(
                        now(), i, url))

                    # feature extraction and writing run in a separate
                    # process
                    p_event_features = Process(
                        target=get_event_features_and_write,
                        args=(event_properties, driver, writer, i,
                              output_file))
                    start_with_timeout(p_event_features, TIME_OUT_FEATURE,
                                       "feature extraction", url, i)
                    # Terminate the process that is actually still alive.
                    if p_event_features.is_alive():
                        p_event_features.terminate()
        finally:
            # Always shut the browser down so PhantomJS instances do not
            # pile up, one per processed file.
            driver.quit()

    return 'done'
Esempio n. 5
0
def process_url(driver, url, i):
    """Load ``url`` and write the features of its elements to a CSV file.

    The output file is ``<PATH_PARSED_FILES>/all_elements_<i>.csv``. Nothing
    is done when that file already exists or when the page cannot be loaded.

    Args:
        driver: Selenium web driver used to load the page.
        url: Address of the page to process.
        i: Index used to build the output file name.

    Returns:
        None.
    """
    output_filename = "{}/{}_{}.csv".format(PATH_PARSED_FILES, 'all_elements',
                                            i)
    if os.path.exists(output_filename):
        print('{}   File already exists {}'.format(now(), output_filename))
        return

    try:
        driver.get(url)
    except Exception:
        # Best effort: a page that fails to load is skipped. Catching
        # Exception (not everything) keeps Ctrl-C and SystemExit working.
        print('{}   The problem with url:   {}'.format(now(), url))
        return

    # NOTE(review): presumably gives the page time to finish rendering
    # before elements are collected - confirm.
    time.sleep(2)

    # The context manager closes the output file even if extraction or
    # writing fails.
    with open(output_filename, 'a') as output_file:
        writer = csv.writer(output_file, delimiter='\t')

        element_features = get_element_features(url, driver)
        write_element_features(element_features, writer, output_file)
Esempio n. 6
0
def get_element_features(url, driver):
    """Collect an ``ElementFeature`` for each relevant element of a page.

    The page is loaded with ``driver``. Candidates are all elements that
    have non-blank text and whose ``style`` attribute does not contain
    ``display:none``. A candidate is kept when its tag is in ``GOOD_TAGS``
    and its text is not empty. An element that raises while being inspected
    or converted is reported with ``print`` and skipped.

    Args:
        url: Address of the page to load.
        driver: Selenium web driver used to load and query the page.

    Returns:
        List of ``ElementFeature`` objects, one per accepted element.
    """
    print('{}   Getting all element features for {}'.format(now(), url))
    driver.get(url)

    visible_text_xpath = (
        "//*[not(contains(@style,'display:none')) and normalize-space(text())]"
    )
    collected = []
    for node in driver.find_elements_by_xpath(visible_text_xpath):
        try:
            # Skip elements with an unwanted tag or without any text.
            if node.tag_name not in GOOD_TAGS or node.text == '':
                continue
            collected.append(ElementFeature(node, url, driver))
        except Exception as err:
            # A single bad element must not abort the whole page.
            print(err)

    return collected
Esempio n. 7
0
def write_element_features(event_features, writer, output_file):
    """Write one row per element feature and flush after every row.

    A row consists of the fixed columns (url, the literal
    ``'not_event_element'``, text, x, y, height, width, tag, the literal
    ``'NaN'``, number of siblings) followed by one column per name in
    ``css_header``, taken from the element's ``css_prop`` mapping.

    Every cell is converted to text before writing: bytes are decoded as
    UTF-8, anything else goes through ``str()`` (so a missing CSS property
    is written as ``'None'``).

    Args:
        event_features: Iterable of element feature objects to write.
        writer: ``csv.writer``-like object receiving the rows.
        output_file: File object behind ``writer``; flushed after each row.
    """
    print("{}   {}".format(now(), "Writing features"))
    for element_feature in event_features:
        row_1_part = [
            element_feature.url, 'not_event_element',
            element_feature.text_property, element_feature.xy_coords['x'],
            element_feature.xy_coords['y'],
            element_feature.block_size['height'],
            element_feature.block_size['width'], element_feature.tag, 'NaN',
            element_feature.num_siblings
        ]
        css_prop = element_feature.css_prop
        row_2_part = [css_prop.get(css_h, None) for css_h in css_header]
        row = row_1_part + row_2_part
        # str(value, "utf-8") only accepts bytes-like objects and raises
        # TypeError for the str/int/None cells of this row, so decode bytes
        # explicitly and stringify everything else.
        writer.writerow([
            cell.decode("utf-8") if isinstance(cell, bytes) else str(cell)
            for cell in row
        ])
        # Flush per row so rows already written reach the file promptly.
        output_file.flush()