Example #1
                    else:
                        url_location_dictionary[url] = tld.replace(".", "").upper()
                except KeyError:
                    print "no entry found for: " + str(tld)

        return url_location_dictionary

if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts locations (country ISO codes) from the top-level domains of URLs, given a JSON file containing Wikipedia articles and the URLs referenced by them')
    parser.add_argument('input', help='a file path to the input JSON file')
    parser.add_argument("--output", dest="output", metavar='output path', type=str, required=True)
    parser.add_argument("--world_fact_book_database", dest="world_fact_book_database", metavar='path to world fact book database', type=str, required=True)
    parser.add_argument("--IANA_database", dest="iana_database", metavar='path to IANA database', type=str, required=True)
    args = parser.parse_args()
    
    inputfile_path = args.input
    outputfile_path = args.output
    wfbdatabase_path = args.world_fact_book_database
    ianadatabase_path = args.iana_database
    
    print "running tld_location_extraction"
    
    # load json input
    with open(inputfile_path) as json_input:    
        json_data = json.load(json_input)
    
    tld_location_extraction = TLDLocationExtraction(ianadatabase_path,wfbdatabase_path)
    url_location_dictionary = tld_location_extraction.get_tld_locations(json_data)
    json_writer.write_json_file(url_location_dictionary, outputfile_path)
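
All of these entry points hand their result dictionary to json_writer.write_json_file, a helper that is not shown in the snippets. A minimal sketch of what such a writer might look like, assuming it is just a thin wrapper around json.dump (the signature is inferred from the calls above):

import json

def write_json_file(data, output_path):
    # Serialize the result dictionary as pretty-printed JSON.
    with open(output_path, 'w') as output_file:
        json.dump(data, output_file, indent=4)
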
Example #2
                    continue
                except Exception as exception:
                    print "Continue after " + exception.__class__.__name__ + " for URL: " + url 
                    continue

        return url_location_dictionary
    

if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts locations (country ISO codes) from the IP addresses of URLs, given a JSON file containing Wikipedia articles and the URLs referenced by them')
    parser.add_argument('input',
                       help='a file path to the input JSON file')
    parser.add_argument("--output", dest="output", metavar='output path', type=str, required=True)
    parser.add_argument("--database", dest="database", metavar='path to mmdb country database', type=str, required=True)
    args = parser.parse_args()
    
    inputfile_path = args.input
    outputfile_path = args.output
    database_path = args.database
    
    print "running ip_location_extraction"
    
    # load json input
    with open(inputfile_path) as json_input:    
        json_data = json.load(json_input)
    
    ip_location_extraction = IPLocationExtraction(database_path)
    url_location_dictionary = ip_location_extraction.get_ip_locations(json_data)
    json_writer.write_json_file(url_location_dictionary, outputfile_path)
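
The --database option points at an mmdb country database, which suggests a MaxMind GeoIP2 lookup inside IPLocationExtraction. The class itself is not shown; the sketch below is one plausible way a single URL could be resolved to an ISO country code under that assumption (the geoip2 calls and the hostname resolution step are illustrative, not taken from this project):

import socket
from urlparse import urlparse  # Python 2; use urllib.parse on Python 3

import geoip2.database

def lookup_country(url, mmdb_path):
    # Resolve the URL's host to an IP address and look it up
    # in the MaxMind country database.
    hostname = urlparse(url).hostname
    ip_address = socket.gethostbyname(hostname)
    reader = geoip2.database.Reader(mmdb_path)
    try:
        return reader.country(ip_address).country.iso_code
    finally:
        reader.close()
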
            
Example #3
            article_count += 1
            
            # print article_count

if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts geolocations from a list of Wikipedia articles given in JSON.')
    parser.add_argument('input',
                               help='a file path to a JSON file containing wikipedia article names')
    parser.add_argument("--output", dest="output", metavar='output path', type=str)
    parser.add_argument("--language", dest="language", metavar='two-letter country code', type=str, help="on of the language editions of dbpedia (default: en):", required=True)
    parser.add_argument("--threshold", dest="threshold", metavar='threshold for majority voting', type=float, help="absolute threshold for majority voting on coordinates (default: 0.1)", required=True)
            
    args = parser.parse_args()
            
    inputfile_path = args.input
    outputfile_path = args.output
    language = args.language
    threshold = args.threshold
    
    # load json input
    with open(inputfile_path) as json_input:    
        json_data = json.load(json_input)

    print "running wikipedia_location_extraction"

    wikipedia_location_extraction = WikipediaLocationExtraction(language)
    article_url_dictionary = wikipedia_location_extraction.get_wikipedia_languages(json_data)
    json_writer.write_json_file(article_url_dictionary, outputfile_path)
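
The --threshold option is described as an absolute threshold for majority voting on coordinates, but the voting code itself is not shown. One plausible reading, sketched below under that assumption, is that a candidate country only wins when its share of all extracted candidates reaches the threshold (the helper name and the share-based rule are guesses, not code from this project):

from collections import Counter

def majority_vote(candidate_countries, threshold):
    # candidate_countries: list of ISO codes extracted for one article.
    # Return the most frequent code if its share reaches the threshold,
    # otherwise report no confident location.
    if not candidate_countries:
        return None
    code, count = Counter(candidate_countries).most_common(1)[0]
    share = float(count) / len(candidate_countries)
    return code if share >= threshold else None
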
Example #4
        metavar='two-letter language code',
        type=str,
        help="one of the language editions of DBpedia (default: en)",
        required=True)
    parser.add_argument(
        "--threshold",
        dest="threshold",
        metavar='threshold for majority voting',
        type=float,
        help=
        "absolute threshold for majority voting on coordinates (default: 0.1)",
        required=True)

    args = parser.parse_args()

    inputfile_path = args.input
    outputfile_path = args.output
    language = args.language
    threshold = args.threshold

    # load json input
    with open(inputfile_path) as json_input:
        json_data = json.load(json_input)

    print "running wikipedia_location_extraction"

    wikipedia_location_extraction = WikipediaLocationExtraction(language)
    article_url_dictionary = wikipedia_location_extraction.get_wikipedia_languages(
        json_data)
    json_writer.write_json_file(article_url_dictionary, outputfile_path)
Example #5
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts URLs from a given Wikipedia URL and calls the feature collection functions for those URLs')
    parser.add_argument('url',
                       help='a Wikipedia URL for which the features are calculated')
    parser.add_argument("--geodatabase", dest="geodatabase", help='path to mmdb country database', type=str, required=True)
    parser.add_argument("--world_fact_book_database", dest="world_fact_book_database", help='path to world fact book database', type=str, required=True)
    parser.add_argument("--IANA_database", dest="iana_database", help='path to IANA database', type=str, required=True)
    parser.add_argument("--model-data", dest="model_data", metavar='path to model-data-directory', type=str, required=True)
    parser.add_argument("--output", dest="output", help='output folder', type=str, required=True)
    args = parser.parse_args()
    
    geodatabase_path = args.geodatabase
    wfbdatabase_path = args.world_fact_book_database
    ianadatabase_path = args.iana_database
    model_data_path = args.model_data
    outputfile_path = args.output
    
    url = args.url

    languages = ["de", "en","es","fr","general","it","nl","sv","uk"]

    article_extraction = ArticleExtraction(geodatabase_path,ianadatabase_path,wfbdatabase_path,model_data_path,languages)
    
    language,title = article_extraction.parse_url(url)

    collected_features = article_extraction.collect_features(url)
    #collected_features_with_prediction = article_extraction.add_predictions(language,collected_features)
    #json_writer.write_json_file(collected_features_with_prediction, outputfile_path+"/"+language+"-"+title+".json")
    json_writer.write_json_file(collected_features, outputfile_path+"/"+language+"-"+title+".json")
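
parse_url is expected to split a Wikipedia article URL into its language edition and title, but its implementation is not included here. A minimal sketch of how that split might work for URLs of the form https://en.wikipedia.org/wiki/Some_Title (the unquoting step is an assumption):

from urllib import unquote     # Python 2; urllib.parse.unquote on Python 3
from urlparse import urlparse  # Python 2; urllib.parse on Python 3

def parse_url(url):
    # 'https://en.wikipedia.org/wiki/Some_Title' -> ('en', 'Some_Title')
    parsed = urlparse(url)
    language = parsed.netloc.split('.')[0]
    title = unquote(parsed.path.split('/wiki/', 1)[1])
    return language, title
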

Example #6
                # It's safe to call clear() here because no descendants will be
                # accessed
                elem.clear()
                # Also eliminate now-empty references from the root node to elem
                for ancestor in elem.xpath('ancestor-or-self::*'):
                    while ancestor.getprevious() is not None:
                        del ancestor.getparent()[0]                         
        
            del context
        
        print "number of articles: " + str(article_count)
        print "number of undetected articles: " + str(article_not_detected_count)

if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts languages from a list of Wikipedia articles given in the XML dump format.')
    parser.add_argument('input',
                       help='a file path to bz2 compressed XML dump input')
    parser.add_argument("--output", dest="output", metavar='output path', type=str, required=True)
    
    args = parser.parse_args()
    
    inputfile_path = args.input
    outputfile_path = args.output
    
    print "running wikipedia_language_extraction"
    
    wikipedia_language_extraction = WikipediaLanguageExtraction()
    wikipedia_language_dictionary = wikipedia_language_extraction.get_wikipedia_languages()
    json_writer.write_json_file(wikipedia_language_dictionary, outputfile_path)
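
The elem.clear() call and the ancestor loop above are the tail end of the usual lxml streaming pattern for large XML dumps. For context, a minimal sketch of the surrounding loop they belong to, reading the bz2-compressed dump page by page (the {*}page tag filter and the process_page callback are assumptions, not code from this project):

import bz2

from lxml import etree

def iterate_pages(dump_path, process_page):
    # Stream <page> elements out of the compressed dump one at a time
    # instead of building the whole tree in memory.
    with bz2.BZ2File(dump_path) as dump_file:
        context = etree.iterparse(dump_file, events=('end',), tag='{*}page')
        for _, elem in context:
            process_page(elem)
            # It's safe to call clear() here because no descendants will be
            # accessed; also drop already-handled siblings from the root.
            elem.clear()
            for ancestor in elem.xpath('ancestor-or-self::*'):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[0]
        del context
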
Example #7
        return url_language_dictionary

    def timeout_handler(self,signum, frame):   # Custom signal handler
        raise TimeoutException
    
    # Change the behavior of SIGALRM
    signal.signal(signal.SIGALRM, timeout_handler) 

if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(description='Extracts languages from the content of URLs given in a JSON file that contains Wikipedia articles and their referenced URLs')
    parser.add_argument('input',
                       help='a file path to the JSON input file')
    parser.add_argument("--output", dest="output", metavar='output path', type=str, required=True)
    args = parser.parse_args()
    
    inputfile_path = args.input
    outputfile_path = args.output
    
    print "running website_language_extraction"

    # load json input
    with open(inputfile_path) as json_input:    
        json_data = json.load(json_input)
    
    website_language_extraction = WebsiteLanguageExtraction()
    url_language_dictionary = website_language_extraction.get_website_languages(json_data)
    json_writer.write_json_file(url_language_dictionary, outputfile_path)
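
The TimeoutException handler registered for SIGALRM above only takes effect together with signal.alarm() around a blocking call. The fetch-and-detect code is not shown; below is a minimal sketch of how WebsiteLanguageExtraction might guard one request with that alarm, assuming urllib2 for the fetch and langdetect for the detection (the real class may do both differently):

import signal
import urllib2  # Python 2; urllib.request on Python 3

from langdetect import detect

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

# SIGALRM-based timeouts are Unix-only.
signal.signal(signal.SIGALRM, timeout_handler)

def detect_website_language(url, timeout_seconds=10):
    # Arm the alarm so a hanging request is aborted by TimeoutException.
    signal.alarm(timeout_seconds)
    try:
        html = urllib2.urlopen(url).read()
        # Raw HTML is a simplification; real code would strip markup first.
        return detect(html)
    except TimeoutException:
        return None
    finally:
        signal.alarm(0)  # always disarm the alarm
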
   
Example #8
                        required=True)
    parser.add_argument("--output",
                        dest="output",
                        help='output folder',
                        type=str,
                        required=True)
    args = parser.parse_args()

    geodatabase_path = args.geodatabase
    wfbdatabase_path = args.world_fact_book_database
    ianadatabase_path = args.iana_database
    model_data_path = args.model_data
    outputfile_path = args.output

    url = args.url

    languages = ["de", "en", "es", "fr", "general", "it", "nl", "sv", "uk"]

    article_extraction = ArticleExtraction(geodatabase_path, ianadatabase_path,
                                           wfbdatabase_path, model_data_path,
                                           languages)

    language, title = article_extraction.parse_url(url)

    collected_features = article_extraction.collect_features(url)
    #collected_features_with_prediction = article_extraction.add_predictions(language,collected_features)
    #json_writer.write_json_file(collected_features_with_prediction, outputfile_path+"/"+language+"-"+title+".json")
    json_writer.write_json_file(
        collected_features,
        outputfile_path + "/" + language + "-" + title + ".json")
Example #9
        # generate new article
        collected_features = article_extraction.collect_features(article_url)
        collected_features_with_prediction = article_extraction.add_predictions(language, collected_features)
        collected_features_with_fixed_outliers = article_extraction.fix_outliers(collected_features_with_prediction, "classification", "classification-fixed", features)
        collected_features_with_fixed_outliers = article_extraction.fix_outliers(collected_features_with_fixed_outliers, "classification-general", "classification-general-fixed", features)
        collected_features_array = article_extraction.get_as_array(collected_features_with_fixed_outliers)

        if len(collected_features_array) > 0:

            # generate directories if they don't exist
            if not os.path.exists(article_path):
                os.makedirs(article_path)
            if not os.path.exists(language_path):
                os.makedirs(language_path)

            json_writer.write_json_file(collected_features_array, article_analysis_path)

            count_features = ["ip-location","tld-location","website-language","classification-fixed","classification-general-fixed"]
            for count_feature in count_features:
                classification_general_counts = count_generation.generate_counts(collected_features_array, count_feature)
                classification_general_counts_array = count_generation.get_as_array(classification_general_counts, 20)

                article_count_path = os.path.join(article_path,"counts-"+count_feature+"-top-20.json")
                json_writer.write_json_file(classification_general_counts_array, article_count_path)

            # generate map data
            map_data = map_data_generation.generate_map_data_array(collected_features_array,"classification-general-fixed")
            article_map_data_path = os.path.join(article_path,"map-data.json")
            json_writer.write_json_file(map_data, article_map_data_path)

            # get execution date
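
count_generation.generate_counts and its get_as_array companion are helpers that are not shown in these snippets. A minimal sketch of what tallying one feature over the collected per-URL records and keeping the top N entries could look like, assuming the records are dictionaries keyed by feature name:

from collections import Counter

def generate_counts(collected_features_array, count_feature):
    # Tally how often each value of count_feature occurs across the records.
    counts = Counter()
    for record in collected_features_array:
        value = record.get(count_feature)
        if value is not None:
            counts[value] += 1
    return counts

def get_as_array(counts, top_n):
    # Return the top_n (value, count) pairs, most frequent first.
    return counts.most_common(top_n)
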
        print "number of articles: " + str(article_count)
        print "number of undetected articles: " + str(
            article_not_detected_count)


if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(
        description=
        'Extracts languages from a list of Wikipedia articles given in the XML dump format.'
    )
    parser.add_argument('input',
                        help='a file path to bz2 compressed XML dump input')
    parser.add_argument("--output",
                        dest="output",
                        metavar='output path',
                        type=str,
                        required=True)

    args = parser.parse_args()

    inputfile_path = args.input
    outputfile_path = args.output

    print "running wikipedia_language_extraction"

    wikipedia_language_extraction = WikipediaLanguageExtraction()
    wikipedia_language_dictionary = wikipedia_language_extraction.get_wikipedia_languages(
    )
    json_writer.write_json_file(wikipedia_language_dictionary, outputfile_path)
Example #11
    signal.signal(signal.SIGALRM, timeout_handler)


if __name__ == '__main__':
    # generate help text for arguments
    parser = argparse.ArgumentParser(
        description=
        'Extracts languages from the content of URLs given in a JSON file that contains Wikipedia articles and their referenced URLs'
    )
    parser.add_argument('input', help='a file path to the JSON input file')
    parser.add_argument("--output",
                        dest="output",
                        metavar='output path',
                        type=str,
                        required=True)
    args = parser.parse_args()

    inputfile_path = args.input
    outputfile_path = args.output

    print "running website_language_extraction"

    # load json input
    with open(inputfile_path) as json_input:
        json_data = json.load(json_input)

    website_language_extraction = WebsiteLanguageExtraction()
    url_language_dictionary = website_language_extraction.get_website_languages(
        json_data)
    json_writer.write_json_file(url_language_dictionary, outputfile_path)