Example #1
def make_worldwide_question_table(header):
    global worldwide
    worldwide_question_table = collections.OrderedDict()
    dictionaries.append(worldwide_question_table)

    question_table_start = 0
    if worldwide_q:
        header["worldwide_question_offset"] = offset_count()
        if file_type == "v":
            question_table_start = len(
                country_language[country_code]) * national

    if file_type == "v":
        question_table_count = len(country_language[country_code])
    elif file_type == "q":
        question_table_count = 9

    for q in question_keys:
        if is_worldwide(q):
            worldwide_question_table["poll_id_%s" % num()] = u32(q)
            worldwide_question_table["poll_category_1_%s" % num()] = u8(
                get_category(q))
            worldwide_question_table["poll_category_2_%s" % num()] = u8(
                categories[get_category(q)])
            worldwide_question_table["opening_timestamp_%s" % num()] = u32(
                get_timestamp(1, "w", get_date(q)))
            worldwide_question_table["closing_timestamp_%s" % num()] = u32(
                get_timestamp(2, "w", get_date(q)))
            worldwide_question_table["question_table_count_%s" %
                                     num()] = u8(question_table_count)
            worldwide_question_table["question_table_start_%s" %
                                     num()] = u32(question_table_start)
            question_table_count += 1

    return worldwide_question_table
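The helpers u8, u16, u32, s16 and u32_littleendian used throughout these examples are not part of the excerpts. A minimal sketch of what they are assumed to do, packing integers into fixed-width byte strings (big-endian for the unqualified variants, since the Wii is a big-endian platform):

import struct

def u8(value):
    return struct.pack(">B", value)  # unsigned 8-bit

def u16(value):
    return struct.pack(">H", value)  # unsigned 16-bit, big-endian

def u32(value):
    return struct.pack(">I", value)  # unsigned 32-bit, big-endian

def s16(value):
    return struct.pack(">h", value)  # signed 16-bit, big-endian

def u32_littleendian(value):
    return struct.pack("<I", value)  # unsigned 32-bit, little-endian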
Example #2
def make_national_question_table(header):
    global national
    national_question_table = collections.OrderedDict()
    dictionaries.append(national_question_table)

    question_table_count = 0
    if national_q:
        header["national_question_offset"] = offset_count()

    for q in question_keys:
        if not is_worldwide(q):
            national_question_table["poll_id_%s" % num()] = u32(q)
            national_question_table["poll_category_1_%s" % num()] = u8(
                get_category(q))
            national_question_table["poll_category_2_%s" % num()] = u8(
                categories[get_category(q)])
            national_question_table["opening_timestamp_%s" % num()] = u32(
                get_timestamp(1, "n", get_date(q)))
            national_question_table["closing_timestamp_%s" % num()] = u32(
                get_timestamp(2, "n", get_date(q)))
            national_question_table["question_table_count_%s" % num()] = u8(
                len(country_language[country_code]))
            national_question_table["question_table_start_%s" %
                                    num()] = u32(question_table_count)
            question_table_count += len(country_language[country_code])

    return national_question_table
Example #3
def make_national_result_table(header):
    table = collections.OrderedDict()
    dictionaries.append(table)

    national_result_detailed_number_count = 0
    national_result_detailed_number_tables = region_number[country_code]
    header["national_result_offset"] = offset_count()

    for i in results:
        if results[i][8] == "n":
            country_index = country_codes.index(country_code)

            table["poll_id_%s" % num()] = u32(i)
            table["male_voters_response_1_num_%s" % num()] = u32(
                results[i][0][country_index])
            table["male_voters_response_2_num_%s" % num()] = u32(
                results[i][2][country_index])
            table["female_voters_response_1_num_%s" % num()] = u32(
                results[i][1][country_index])
            table["female_voters_response_2_num_%s" % num()] = u32(
                results[i][3][country_index])
            table["predictors_response_1_num_%s" % num()] = u32(
                results[i][4][country_index])
            table["predictors_response_2_num_%s" % num()] = u32(
                results[i][5][country_index])
            table["show_voter_number_flag_%s" % num()] = u8(1)
            table["detailed_results_flag_%s" % num()] = u8(1)
            table["national_result_detailed_number_number_%s" %
                  num()] = u8(national_result_detailed_number_tables)
            table["starting_national_result_detailed_number_table_number_%s" %
                  num()] = u32(national_result_detailed_number_count)
            national_result_detailed_number_count += national_result_detailed_number_tables

    return table
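Every table builder above appends its OrderedDict to a shared dictionaries list and records offset_count() in the header just before building the next table. A sketch of the assumed accumulator, which treats the running byte length of everything collected so far as the next table's file offset (packed with the u32 helper sketched earlier):

dictionaries = []

def offset_count():
    # Total size in bytes of all values collected so far; the next table
    # will therefore start at this offset in the finished file.
    return u32(sum(
        len(value) for dictionary in dictionaries
        for value in dictionary.values() if value is not None
    ))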
Example #4
def make_source_pictures(source_table, data):
    source_pictures = {}
    dictionaries.append(source_pictures)

    source_articles = []

    sources = [
        "ANP",
        "AP",
        "dpa",
        "Reuters",
        "SID",
        "NU.nl",
    ]  # these are the news sources which will use a custom JPG for the logo

    for article in list(data.values()):
        if article[8] not in source_articles:
            if article[8] in sources:
                source_articles.append(article[8])

                source_table["pictures_offset_%s" % article[8]] = offset_count()

                with open(
                    "./Channels/News_Channel/logos/%s.jpg" % article[8], "rb"
                ) as source_file:
                    image = source_pictures["logo_%s" % article[8]] = source_file.read()
                    source_table["pictures_size_%s" % article[8]] = u32(len(image))

                if source_table["source_picture_%s" % article[8]] != u8(0):
                    source_table["source_picture_%s" % article[8]] = u8(0)

    return source_pictures
Example #5
def make_source_table(header, articles_table, source, data):
    source_table = {}
    dictionaries.append(source_table)

    header["source_offset"] = offset_count()  # Offset for the source table.

    source_articles = []

    numbers = 0

    numbers_article = 0

    for article in list(data.values()):
        if article[8] not in source_articles:
            source_articles.append(article[8])

            source_table["source_picture_%s" % article[8]] = u8(
                source["picture"]
            )  # Picture for the source.
            source_table["source_position_%s" % article[8]] = u8(
                source["position"]
            )  # Position for the source.
            source_table["padding_%s" % article[8]] = u16(0)  # Padding.

            source_table["pictures_size_%s" % article[8]] = u32(
                0
            )  # Size of the source picture.
            source_table["pictures_offset_%s" % article[8]] = u32(
                0
            )  # Offset for the source picture.

            source_table["name_size_%s" % article[8]] = u32(
                0
            )  # Size of the source name.
            source_table["name_offset_%s" % article[8]] = u32(
                0
            )  # Offset for the source name.

            source_table["copyright_size_%s" % article[8]] = u32(
                0
            )  # Size of the copyright.
            source_table["copyright_offset_%s" % article[8]] = u32(
                0
            )  # Offset for the copyright.

            numbers += 1

    for article in list(data.values()):
        numbers_article += 1

        articles_table["source_%s_number" % numbers_article] = u32(
            source_articles.index(article[8])
        )  # Number for the source.

    header["source_number"] = u32(numbers)  # Number of entries for the source table.

    return source_table
Example #6
def make_header():
    header = collections.OrderedDict()
    dictionaries.append(header)

    header["timestamp"] = u32(get_timestamp(0, None, None))
    header["country_code"] = u8(country_code)
    header["publicity_flag"] = u8(0)
    header["question_version"] = u8(0 if file_type == "r" else 1)
    header["result_version"] = u8(1 if file_type == "r" else 0)
    header["national_question_number"] = u8(national)
    header["national_question_offset"] = u32(0)
    header["worldwide_question_number"] = u8(worldwide)
    header["worldwide_question_offset"] = u32(0)
    header["question_number"] = u8(questions * len(country_language[country_code]))
    header["question_offset"] = u32(0)
    header["national_result_entry"] = u8(national_results)
    header["national_result_offset"] = u32(0)
    header["national_result_detailed_number"] = u16(national_results * region_number[country_code])
    header["national_result_detailed_offset"] = u32(0)
    header["position_number"] = u16(0 if file_type == "q" or national_results == 0 else 22 if country_code == 77 else len(position_table[country_code]) if country_code in position_table.keys() else 0)
    header["position_offset"] = u32(0)
    header["worldwide_result_number"] = u8(worldwide_results)
    header["worldwide_result_offset"] = u32(0)
    header["worldwide_result_detailed_number"] = u16(0)
    header["worldwide_result_detailed_offset"] = u32(0)
    header["country_name_number"] = u16(len(countries) * 7 if file_type == "r" and nw == "w" else 0 if file_type == "q" or file_type == "r" else len(countries) * 7)
    header["country_name_offset"] = u32(0)

    return header
Example #7
def make_worldwide_result_table(header):
    table = collections.OrderedDict()
    dictionaries.append(table)

    worldwide_detailed_table_count_all = 0
    header["worldwide_result_offset"] = offset_count()

    for i in results:
        if results[i][8] == "w":
            worldwide_detailed_table_count = 0
            for j in range(len(countries)):  # 33
                total = 0
                for voters in range(0, 4):
                    total += results[i][voters][j]
                if total > 0:
                    worldwide_detailed_table_count += 1

            table["poll_id_%s" % num()] = u32(i)
            table["male_voters_response_1_num_%s" % num()] = u32(sum(results[i][0]))
            table["male_voters_response_2_num_%s" % num()] = u32(sum(results[i][2]))
            table["female_voters_response_1_num_%s" % num()] = u32(sum(results[i][1]))
            table["female_voters_response_2_num_%s" % num()] = u32(sum(results[i][3]))
            table["predictors_response_1_num_%s" % num()] = u32(sum(results[i][4]))
            table["predictors_response_2_num_%s" % num()] = u32(sum(results[i][5]))
            table["total_worldwide_detailed_tables_%s" % num()] = u8(worldwide_detailed_table_count)
            table["starting_worldwide_detailed_table_number_%s" % num()] = u32(worldwide_detailed_table_count_all)
            worldwide_detailed_table_count_all += worldwide_detailed_table_count

    return table
Example #8
def make_question_text_table(header):
    global questions
    question_text_table = collections.OrderedDict()
    dictionaries.append(question_text_table)

    header["question_offset"] = offset_count()

    for q in question_keys:
        if not is_worldwide(q):
            language_list = country_language[country_code]
        else:
            if file_type == "v":
                language_list = country_language[country_code]
            elif file_type == "q":
                language_list = range(1, 9)
        for language_code in language_list:
            if get_question(q, language_code) is not None:
                question_number = question_keys.index(q)
                question_text_table["language_code_%s_%s" %
                                    (question_number, language_code)] = u8(language_code)
                question_text_table["question_offset_%s_%s" %
                                    (question_number, language_code)] = u32(0)
                question_text_table["response_1_offset_%s_%s" %
                                    (question_number, language_code)] = u32(0)
                question_text_table["response_2_offset_%s_%s" %
                                    (question_number, language_code)] = u32(0)

    return question_text_table
Example #9
def make_pictures(pictures_table, data):
    pictures = {}
    dictionaries.append(pictures)

    numbers = 0

    for article in list(data.values()):
        numbers += 1
        if article[4] is not None:
            if "pictures_%s_offset" % numbers in pictures_table:
                pictures_table[
                    "pictures_%s_offset" % numbers
                ] = offset_count()  # Offset for the pictures.
            pictures["pictures_%s_read" % numbers] = article[4]  # Read the pictures.
            pictures["nullbyte_%s_pictures" % numbers] = u8(
                0
            )  # Null byte for the pictures.

            for types in ["captions", "credits"]:
                if pictures_table["%s_%s_offset" % (types, numbers)] != u32(
                    0
                ) and pictures_table["%s_%s_size" % (types, numbers)] == u32(0):
                    pictures_table["%s_%s_offset" % (types, numbers)] = u32(0)

    return pictures
Example #10
def make_header(data):
    header = collections.OrderedDict()
    dictionaries.append(header)

    header["updated_timestamp_1"] = get_timestamp(1)  # Updated time.
    header["term_timestamp"] = get_timestamp(2)  # Timestamp for the term.
    header["country_code"] = u32_littleendian(country_code)  # Wii Country Code.
    header["updated_timestamp_2"] = get_timestamp(1)  # 3rd timestamp.

    # List of languages that appear on the language select screen 

    numbers = 0

    for language in languages:
        numbers += 1

        header["language_select_%s" % numbers] = u8(language)

    # Fills the rest of the languages as null 

    while numbers < 16:
        numbers += 1

        header["language_select_%s" % numbers] = u8(255)

    header["language_code"] = u8(language_code)  # Wii language code.
    header["goo_flag"] = u8(0)  # Flag to make the Globe display "Powered by Goo".
    header["language_select_screen_flag"] = u8(0)  # Flag to bring up the language select screen.
    header["download_interval"] = u8(30)  # Interval in minutes to check for new articles to display on the Wii Menu.
    header["message_offset"] = u32(0)  # Offset for a message.
    header["topics_number"] = u32(0)  # Number of entries for the topics table.
    header["topics_offset"] = u32(0)  # Offset for the topics table.
    header["articles_number"] = u32(0)  # Number of entries for the articles table.
    header["articles_offset"] = u32(0)  # Offset for the articles table.
    header["source_number"] = u32(0)  # Number of entries for the source table.
    header["source_offset"] = u32(0)  # Offset for the source table.
    header["locations_number"] = u32(0)  # Number of entries for the locations.
    header["locations_offset"] = u32(0)  # Offset for the locations table.
    header["pictures_number"] = u32(0)  # Number of entries for the pictures table.
    header["pictures_offset"] = u32(0)  # Offset for the pictures table.
    header["count"] = u16(480)  # Count value.
    header["unknown"] = u16(0)  # Unknown.
    header["wiimenu_articles_number"] = u32(0)  # Number of Wii Menu article entries.
    header["wiimenu_articles_offset"] = u32(0)  # Offset for the Wii Menu article table.
    header["wiimenu_articles_offset"] = offset_count()  # Offset for the Wii Menu article table.

    numbers = 0

    headlines = []

    for article in list(data.values()):
        if numbers < 11:
            if article[3].replace(b'\n', b'') not in headlines:
                numbers += 1
                headlines.append(article[3])
                header["headline_%s_size" % numbers] = u32(0)  # Size of the headline.
                header["headline_%s_offset" % numbers] = u32(0)  # Offset for the headline.

    return header
Example #11
def make_country_name_table(header):
    global countries
    country_name_table = collections.OrderedDict()
    dictionaries.append(country_name_table)

    header["country_name_offset"] = offset_count()

    for num, k in enumerate(countries.keys()):
        for i in range(len(languages)):
            country_name_table["language_code_%s_%s" % (num, i)] = u8(i)
            country_name_table["text_offset_%s_%s" % (num, i)] = u32(0)

    return country_name_table
Example #12
def make_national_result_detailed_table(header):
    table = collections.OrderedDict()
    dictionaries.append(table)

    header["national_result_detailed_offset"] = offset_count()

    for i in results:
        if results[i][8] == "n":
            for j in range(region_number[country_code]):
                country_index = country_codes.index(country_code)
                table["voters_response_1_num_%s" % num()] = u32(results[i][6][country_index][j])
                table["voters_response_2_num_%s" % num()] = u32(results[i][7][country_index][j])
                table["position_entry_table_count_%s" % num()] = u8(0 if (results[i][6][country_index][j] == 0 and results[i][7][country_index][j] == 0) or (country_code not in position_table.keys()) else position_table[country_code][j])
                table["starting_position_entry_table_%s" % num()] = u32(sum(position_table[country_code][:j]) if country_code in position_table.keys() else 0)

    return table
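The starting_position_entry_table field above is a prefix sum: region j starts after all the entries of regions 0 through j - 1. A short illustration with hypothetical per-region entry counts:

position_entries = [3, 5, 2]  # hypothetical position_table[country_code]
starts = [sum(position_entries[:j]) for j in range(len(position_entries))]
print(starts)  # [0, 3, 8]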
Example #13
def locations_download(
    language_code, data
):  # using Google Maps API is so much better than the crap Nintendo used for say, AP news.
    locations = {}
    gmaps = googlemaps.Client(key=config["google_maps_api_key"])

    languages = {  # corresponds to the Wii's language codes
        0: "ja",
        1: "en",
        2: "de",
        3: "fr",
        4: "es",
        5: "it",
        6: "nl",
    }

    for keys, values in list(data.items()):
        location = values[7]

        if location and location != "":
            if location not in locations:
                locations[location] = [None, None, []]

            locations[location][2].append(keys)

    for name in list(locations.keys()):
        if name == "":
            continue

        uni_name = (
            name if languages[language_code] == "ja" else unidecode(name)
        )  # if using unidecode with Japanese, it'll translate all the characters to English

        print(uni_name)

        coordinates = None

        if name not in cities:
            try:
                read = gmaps.geocode(uni_name,
                                     language=languages[language_code])
                loc_name = read[0]["address_components"][0]["long_name"]

                if languages[language_code] == "ja":
                    loc_name = enc(loc_name)
                else:
                    loc_name = enc(unidecode(loc_name))
                """Not doing anything with these."""

                country = u8(0)
                region = u8(0)
                location = u16(0)
                zoom_factor = u32_littleendian(
                    6
                )  # Nintendo used the value of 3 for states and countries but we probably don't have any articles that are just states or countries

                coordinates = (
                    s16(
                        int(read[0]["geometry"]["location"]["lat"] /
                            (360 / 65536))) + s16(
                                int(read[0]["geometry"]["location"]["lng"] /
                                    (360 / 65536))) + country + region +
                    location + zoom_factor
                )  # latitude and longitude is divided by the value of 360 (degrees of a full circle) divided by the max int for a 16-bit int
            except Exception as e:
                ex = "There was a error downloading the location data - line {}: {}".format(
                    sys.exc_info()[-1].tb_lineno, str(e))
                print(ex)
                log(ex, "INFO")

        else:
            coordinates = binascii.unhexlify(cities[name][0] +
                                             "0000000006000000")
            loc_name = enc(cities[name][1])

        if locations[name][0] is None and coordinates is not None:
            locations[name][0] = coordinates
        else:
            del locations[name]
            continue

        if locations[name][1] is None:
            locations[name][1] = loc_name

    return locations
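The coordinate packing above scales degrees by 65536 / 360, so the full 360-degree circle maps onto the signed 16-bit range and one step is about 0.0055 degrees. A worked round trip (encode_degrees and decode_degrees are illustrative names, not from the original code):

import struct

def encode_degrees(degrees):
    return struct.pack(">h", int(degrees / (360 / 65536)))

def decode_degrees(packed):
    return struct.unpack(">h", packed)[0] * (360 / 65536)

packed = encode_degrees(52.37)  # 52.37 degrees -> 9533 -> b'%='
print(decode_degrees(packed))   # 52.3663..., off by less than one step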
Example #14
    return extracted_idioms


if __name__ == '__main__':
    print 'Hello! Time is {0}'.format(config.TIME)

    # Create working directory if it doesn't exist
    if not os.path.isdir(config.WORK_DIR):
        os.mkdir(config.WORK_DIR)

    # Read in corpus as list of documents
    if config.CORPUS_TYPE == 'plain':
        documents = process_corpus.plain_text(config.CORPUS, config.NO_SPLIT)
        print 'First sentence of corpus: {0}\nLast sentence of corpus: {1}'.format(
            u8(documents[0][0]), u8(documents[-1][-1]))
    elif config.CORPUS_TYPE[0:3] == 'bnc':
        cache_path = os.path.join(
            config.WORK_DIR, '{0}_parsed_xml.json'.format(config.CORPUS_TYPE))
        documents = process_corpus.bnc(config.CORPUS, config.CORPUS_TYPE,
                                       cache_path)
        print 'First sentence of corpus: {0}\nLast sentence of corpus: {1}'.format(
            u8(documents[0][0]['sentence']), u8(documents[-1][-1]['sentence']))

    # Get idioms from dictionary
    idioms = get_idiom_list(case_sensitive=config.CASE_SENSITIVE)
    print "Found {4} idioms ranging from '{0}', '{1}' to '{2}', '{3}'".format(
        u8(idioms[0]), u8(idioms[1]), u8(idioms[-2]), u8(idioms[-1]),
        len(idioms))

    # Extract idioms
Example #15
extracted_idioms_1 = []
with open(args.extracted_1, 'r') as csvfile:
	csvreader = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL, quotechar='"')
	for csvrow in csvreader:
		extracted_idioms_1.append({
			'document_id': csvrow[4],
			'sentence_number': csvrow[5],
			'idiom': csvrow[0],
			'context': unicode(csvrow[3], 'utf-8'),
			'start': csvrow[1],
			'end': csvrow[2],
			'bnc_start': csvrow[6],
			'bnc_end': csvrow[7],
		})
extracted_idioms_2 = []
with open(args.extracted_2, 'r') as csvfile:
	csvreader = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL, quotechar='"')
	for csvrow in csvreader:
		extracted_idioms_2.append({
			'document_id': csvrow[4],
			'sentence_number': csvrow[5],
			'idiom': csvrow[0],
			'context': unicode(csvrow[3], 'utf-8'),
			'start': csvrow[1],
			'end': csvrow[2],
			'bnc_start': csvrow[6],
			'bnc_end': csvrow[7],
		})
	
# Combine two sets of extractions
combined_idioms = copy.deepcopy(extracted_idioms_1)
for extracted_idiom_2 in extracted_idioms_2:
	matched = False
	for extracted_idiom_1 in extracted_idioms_1:
		if (extracted_idiom_2['idiom'].lower() == extracted_idiom_1['idiom'].lower()
				and extracted_idiom_2['document_id'] == extracted_idiom_1['document_id']
				and extracted_idiom_2['sentence_number'] == extracted_idiom_1['sentence_number']):
			matched = True
			break
	if not matched:
		combined_idioms.append(extracted_idiom_2)

# Output to file	
with open(args.combined, 'w') as of:
	writer = csv.writer(of, delimiter='\t', quoting=csv.QUOTE_MINIMAL, quotechar='"')
	for extracted_idiom in combined_idioms:
		output_row = [u8(extracted_idiom['idiom']), extracted_idiom['start'], extracted_idiom['end'], 
			u8(extracted_idiom['context']), u8(extracted_idiom['document_id']), u8(extracted_idiom['sentence_number']), 
			extracted_idiom['bnc_start'], extracted_idiom['bnc_end']]
		writer.writerow(output_row)
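The nested matching loop above is quadratic in the number of extractions. An equivalent merge using a set of normalized match keys, shown here as a sketch of the same dedup rule (idiom, document, sentence) rather than code from the original repository:

import copy

seen = set((idiom['idiom'].lower(), idiom['document_id'], idiom['sentence_number'])
           for idiom in extracted_idioms_1)

combined_idioms = copy.deepcopy(extracted_idioms_1)
for extracted_idiom_2 in extracted_idioms_2:
    key = (extracted_idiom_2['idiom'].lower(),
           extracted_idiom_2['document_id'],
           extracted_idiom_2['sentence_number'])
    if key not in seen:
        combined_idioms.append(extracted_idiom_2)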
Example #16
def locations_download(language_code, data):
    locations = collections.OrderedDict()
    locations_return = collections.OrderedDict()
    gmaps = googlemaps.Client(key=config["google_maps_api_key"])

    """This dictionary is used to determine languages."""

    languages = {
        0: "ja",
        1: "en",
        2: "de",
        3: "fr",
        4: "es",
        5: "it",
        6: "nl",
    }

    for keys, values in data.items():
        location = values[7]

        if location is not None:
            if location not in locations:
                locations[location] = []

            locations[location].append(keys)

    for name in locations.keys():
        read = None

        if name == "":
            continue

        uni_name = name if languages[language_code] == "ja" else unidecode(name)

        print uni_name

        if name not in cities:
            try:
                read = gmaps.geocode(uni_name, language=languages[language_code])
            except Exception:
                log("There was an error downloading the location data.", "INFO")

        if read is None and name in cities:
            coordinates = binascii.unhexlify(cities[name][0] + "0000000006000000")
            new_name = enc(cities[name][1])

            for filenames in locations[name]:
                if new_name not in locations_return:
                    locations_return[new_name] = [coordinates, []]

                locations_return[new_name][1].append(filenames)

        elif read is not None:
            try:
                new_name = read[0]["address_components"][0]["long_name"].encode("utf-16be")

                """Not doing anything with these at this time."""

                country = u8(0)
                region = u8(0)
                location = u16(0)
                zoom_factor = u32_littleendian(6)

                # 0.0054931640625 == 360 / 65536: scales degrees so a full
                # circle spans the unsigned 16-bit range.
                coordinates = u16(
                    int(read[0]["geometry"]["location"]["lat"] / 0.0054931640625) & 0xFFFF
                ) + u16(
                    int(read[0]["geometry"]["location"]["lng"] / 0.0054931640625) & 0xFFFF
                ) + country + region + location + zoom_factor

                for filenames in locations[name]:
                    if new_name not in locations_return:
                        locations_return[new_name] = [coordinates, []]

                    locations_return[new_name][1].append(filenames)
            except Exception:
                log("There was an error downloading the location data.", "INFO")

    return locations_return
Example #17
								extracted_idioms.append(extracted_idiom)
								previously_matched_indices = matched_indices

	return extracted_idioms

if __name__ == '__main__':
	print('Hello! Time is {0}'.format(config.TIME))

	# Create working directory if it doesn't exist
	if not os.path.isdir(config.WORK_DIR):
		os.mkdir(config.WORK_DIR)

	# Read in corpus as list of documents
	if config.CORPUS_TYPE == 'plain':
		documents = process_corpus.plain_text(config.CORPUS, config.NO_SPLIT)
		print('First sentence of corpus: {0}\nLast sentence of corpus: {1}'.format(u8(documents[0][0]), u8(documents[-1][-1])))
	elif config.CORPUS_TYPE[0:3] == 'bnc':
		cache_path = os.path.join(config.WORK_DIR, '{0}_parsed_xml.json'.format(config.CORPUS_TYPE))
		documents = process_corpus.bnc(config.CORPUS, config.CORPUS_TYPE, cache_path)
		print('First sentence of corpus: {0}\nLast sentence of corpus: {1}'.format(u8(documents[0][0]['sentence']), u8(documents[-1][-1]['sentence'])))

	# Get idioms from dictionary
	idioms = get_idiom_list(case_sensitive = config.CASE_SENSITIVE)
	print("Found {4} idioms ranging from '{0}', '{1}' to '{2}', '{3}'".format(u8(idioms[0]), u8(idioms[1]), u8(idioms[-2]), u8(idioms[-1]), len(idioms)))

	# Extract idioms
	extraction_start = time.time()
	if config.METHOD == 'exact':
		extracted_idioms = string_match(idioms, documents, fuzzy = False, inflect = False, case_sensitive = config.CASE_SENSITIVE)
	elif config.METHOD == 'fuzzy':
		extracted_idioms = string_match(idioms, documents, fuzzy = True, inflect = False, case_sensitive = config.CASE_SENSITIVE)
Example #18
                                extracted_idioms.append(extracted_idiom)
                                previously_matched_indices = matched_indices

    return extracted_idioms


if __name__ == '__main__':
    # Create working directory if it doesn't exist
    if not os.path.isdir(config.WORK_DIR):
        os.mkdir(config.WORK_DIR)

    # Read in corpus as list of documents
    if config.CORPUS_TYPE == 'plain':
        documents = train_dataset.plain_text(config.CORPUS, config.NO_SPLIT)
        print('First sentence of corpus: {0}\nLast sentence of corpus: {1}'.
              format(u8(documents[0][0]), u8(documents[-1][-1])))
    # Get idioms from dictionary
    idioms = get_idiom_list(case_sensitive=config.CASE_SENSITIVE)
    print("Found {4} idioms ranging from '{0}', '{1}' to '{2}', '{3}'".format(
        u8(idioms[0]), u8(idioms[1]), u8(idioms[-2]), u8(idioms[-1]),
        len(idioms)))

    # Extract idioms
    extraction_start = time.time()
    if config.METHOD == 'exact':
        extracted_idioms = string_match(idioms,
                                        documents,
                                        fuzzy=False,
                                        inflect=False,
                                        case_sensitive=config.CASE_SENSITIVE)
    elif config.METHOD == 'fuzzy':
Example #19
def locations_download(language_code, data):
    locations = collections.OrderedDict()
    gmaps = googlemaps.Client(key=config["google_maps_api_key"])
    """This dictionary is used to determine languages."""

    languages = {
        0: "ja",
        1: "en",
        2: "de",
        3: "fr",
        4: "es",
        5: "it",
        6: "nl",
    }

    for keys, values in list(data.items()):
        location = values[7]

        if location is not None:
            if location not in locations:
                locations[location] = [None, None, []]

            locations[location][2].append(keys)

    for name in list(locations.keys()):
        if name == "":
            continue

        uni_name = name if languages[language_code] == "ja" else unidecode(
            name
        )  # If using unidecode with Japanese, it'll translate all the characters to English

        print(uni_name)

        if name not in cities:
            try:
                read = gmaps.geocode(uni_name,
                                     language=languages[language_code])
                loc_name = read[0]["address_components"][0]["long_name"]

                if languages[language_code] == "ja":
                    loc_name = enc(loc_name)
                else:
                    loc_name = enc(unidecode(loc_name))
                """Not doing anything with these."""

                country = u8(0)
                region = u8(0)
                location = u16(0)
                zoom_factor = u32_littleendian(6)

                coordinates = s16(int(read[0]["geometry"]["location"]["lat"] / (360 / 65536))) + \
                                s16(int(read[0]["geometry"]["location"]["lng"] / (360 / 65536))) + \
                                country + region + location + zoom_factor
            except Exception:
                log("There was an error downloading the location data.", "INFO")
                continue  # coordinates/loc_name were never set; skip this entry

        else:
            coordinates = binascii.unhexlify(cities[name][0] +
                                             "0000000006000000")
            loc_name = enc(cities[name][1])

        if locations[name][0] is None:
            locations[name][0] = coordinates

        if locations[name][1] is None:
            locations[name][1] = loc_name

    return locations
Example #20
def make_source_table(header, articles_table, data):
    source_table = collections.OrderedDict()
    dictionaries.append(source_table)

    header["source_offset"] = offset_count()  # Offset for the source table.

    source_articles = []
    """These are the picture and position values."""

    source_nums = {
        "AP": [0, 1],
        "Reuters": [0, 4],
        "AFP": [4, 4],
        "AFP_French": [4, 4],
        "ANP": [0, 5],
        "ANSA": [6, 6],
        "dpa": [0, 4],
        "SID": [0, 4],
        "NU.nl": [0, 5],
        "Reuters_Japanese": [0, 4],
    }

    numbers = 0

    numbers_article = 0

    for article in data.values():
        if article[8] not in source_articles:
            source_articles.append(article[8])

            source = source_nums[article[8]]

            source_table["source_picture_%s" % article[8]] = u8(
                source[0])  # Picture for the source.
            source_table["source_position_%s" % article[8]] = u8(
                source[1])  # Position for the source.
            source_table["padding_%s" % article[8]] = u16(0)  # Padding.

            source_table["pictures_size_%s" % article[8]] = u32(
                0)  # Size of the source picture.
            source_table["pictures_offset_%s" % article[8]] = u32(
                0)  # Offset for the source picture.

            source_table["name_size_%s" % article[8]] = u32(
                0)  # Size of the source name.
            source_table["name_offset_%s" % article[8]] = u32(
                0)  # Offset for the source name.

            source_table["copyright_size_%s" % article[8]] = u32(
                0)  # Size of the copyright.
            source_table["copyright_offset_%s" % article[8]] = u32(
                0)  # Offset for the copyright.

            numbers += 1

    for article in data.values():
        numbers_article += 1

        articles_table["source_%s_number" % numbers_article] = u32(
            source_articles.index(article[8]))  # Number for the source.

    header["source_number"] = u32(
        numbers)  # Number of entries for the source table.

    return source_table
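The second pass above numbers each article by the position of its source in the discovery-ordered source_articles list. A tiny illustration with hypothetical article tuples (only index 8, the source name, matters here):

data = {
    "a1": [None] * 8 + ["AP"],
    "a2": [None] * 8 + ["Reuters"],
    "a3": [None] * 8 + ["AP"],
}

source_articles = []
for article in data.values():
    if article[8] not in source_articles:
        source_articles.append(article[8])

print([source_articles.index(article[8]) for article in data.values()])
# [0, 1, 0] -- both AP articles share source number 0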