import csv
import re
from string import punctuation

from google_ngram_downloader import readline_google_store


def load_unigrams(my_index):
    """Save post-1917 Russian unigrams from one index shard to a TSV file."""
    punct = punctuation + '«»—…“”*–'
    russian = "[А-Яа-я]+"
    tags = ("_ADJ", "_ADP", "_ADV", "_CONJ", "_NOUN", "_NUM", "_PRT", "_VERB",
            "_X")

    fname, url, records = next(
        readline_google_store(ngram_len=1, lang='rus', indices=my_index))
    count = 0

    with open('unigrams_' + my_index + '.tsv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        for record in records:
            # Keep only post-1917 records that are longer than two
            # characters, contain Cyrillic, and carry no POS tag.
            if record.year < 1918:
                continue
            if (len(record.ngram.strip(punct)) > 2
                    and re.search(russian, record.ngram)
                    and not record.ngram.endswith(tags)):
                writer.writerow([
                    record.ngram, record.year, record.match_count,
                    record.volume_count
                ])
                count += 1

    print(str(count) + " " + my_index + " ngrams saved")
    return 0
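
# A minimal usage sketch (an addition, not part of the original snippet);
# 'a' is assumed to name one shard of the Russian 1-gram files.
if __name__ == '__main__':
    load_unigrams('a')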
Example #2
from google_ngram_downloader import readline_google_store


def getNgrams(self):
    """Get the frequency of words in the Google Ngram corpus."""
    keys = self.Ngrams.keys()
    alphabet = list(map(chr, range(97, 123)))  # 'a' through 'z'
    count = 0
    current = ''
    for char in alphabet:
        for name, url, wordGen in readline_google_store(ngram_len=1,
                                                        indices=char):
            for token, year, match, volume in wordGen:
                if token not in keys:
                    continue
                if token == current:
                    count += match
                else:
                    if current:
                        self.Ngrams[current] = count
                    current = token
                    # Begin the new token's tally with this record's count.
                    count = match
        print("Finished with\t" + char + "\n")
    # Store the tally of the final token before finishing.
    if current:
        self.Ngrams[current] = count
    print("Ngram Counts Completed!")
Example #3
import time

from google_ngram_downloader import readline_google_store


def find_google_ngrams_word_count(word, time_function=False, verbose=False):
    if time_function:
        time1 = time.time()

    # Start at 2 so downstream ratio computations never divide by zero.
    count = 2
    # TODO: Consider how we want to deal with capitalization
    fname, url, records = next(readline_google_store(ngram_len=1, indices=word[0]))
    # With the verbose setting, occasionally print out a record
    verbosity_count = 1000000000
    earliest_year = 1950
    i = 0
    try:
        record = next(records)
        while record.ngram != word:
            record = next(records)
            if verbose and i % verbosity_count == 0:
                print(record)
            i += 1
        while record.ngram == word:
            if record.year >= earliest_year:
                count += record.match_count
                if verbose:
                    print(record)
            record = next(records)
    except StopIteration:
        pass
    if time_function:
        time2 = time.time()
        print('Total seconds for ' + word + ': ' + str(int(time2 - time1)))
    return count
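
# A hedged usage sketch (an addition): total post-1950 match count for a
# lowercase ASCII word; indices=word[0] shards by the word's first letter.
print(find_google_ngrams_word_count('apple', time_function=True))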
Example #4

import csv
import re

from google_ngram_downloader import readline_google_store


def load_ngrams(my_ngram_len, indx):
    """Save post-1917 Russian ngrams from one index shard to a TSV file."""
    russian = "[А-Яа-я]+"
    tags = "ADJ|ADP|ADV|CONJ|NOUN|NUM|PRT|VERB"

    fname, url, records = next(readline_google_store(
        ngram_len=my_ngram_len, lang='rus', indices=[indx]))
    count = 0

    with open(str(my_ngram_len) + 'grams-' + indx + '.tsv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        for record in records:
            # Keep only post-1917 records longer than five characters that
            # contain Cyrillic and carry no POS tag.
            if record.year < 1918:
                continue
            if (len(record.ngram) > 5
                    and re.search(russian, record.ngram)
                    and not re.search(tags, record.ngram)):
                writer.writerow([record.ngram,
                                 record.year,
                                 record.match_count,
                                 record.volume_count])
                count += 1

    print(str(count) + " " + indx + " " + str(my_ngram_len) + "grams saved")
    return 0
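
# A minimal usage sketch (an addition, not part of the original snippet):
if __name__ == '__main__':
    load_ngrams(2, 'a')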
Example #5
from google_ngram_downloader import readline_google_store


def count_ngrams(phrase, length, lang):
    """The raw data for unigrams has been downloaded locally, but not for bigrams or trigrams."""
    count = 0
    chinese_character_to_sound = {
        u'\u5341': 's',
        u'\u4e8c': 'e',
        u'\u4e09': 's',
        u'\u56db': 's',
        u'\u4e94': 'w',
        u'\u516d': 'l',
        u'\u4e03': 'q',
        u'\u516b': 'b',
        u'\u4e5d': 'j'  # 九 (jiǔ)
    }
    ngram_downloader_langcode = {
        "english": "eng",
        "chinese": "chi-sim",
        "french": "fre",
        "german": "ger",
        "hebrew": "heb",
        "italian": "ita",
        "russian": "rus",
        "spanish": "spa"
    }

    if lang == "chinese":
        index = chinese_character_to_sound[phrase[0].lower()]
    else:
        index = phrase[0].lower()

    # get_combo is a helper from the surrounding project (not shown here);
    # it expands the starting index into the shard names to fetch.
    all_combinations = get_combo(index, length)
    print(all_combinations)

    fname, url, records = next(
        readline_google_store(ngram_len=length,
                              lang=ngram_downloader_langcode[lang],
                              indices=all_combinations))

    try:
        record = next(records)
        print(record.ngram)
        while record.ngram != phrase:
            record = next(records)
            print(record.ngram)

        while record.ngram == phrase:
            count += record.match_count
            record = next(records)
            print(record.ngram)

    except StopIteration:
        pass

    return count
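
# A hedged usage sketch (an addition; assumes the external get_combo helper
# from the original project is available):
print(count_ngrams("the cat", 2, "english"))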
Example #6


def test_google_ngram_download():
    from google_ngram_downloader import readline_google_store
    fname, url, records = next(readline_google_store(ngram_len=5))
    debug('fname = ' + fname)
    debug('url = ' + url)

    record = next(records)
    #debug('next: ' + str(record))
    #debug('next gram: ' + str(record.ngram.encode('utf-8')))
    try:
        while record:
            # Compare str to str; encoding to bytes first would break the
            # `in` test under Python 3.
            if 'American' in record.ngram:
                debug('gram: ' + str(record))
            record = next(records)
    except StopIteration:
        pass
Example #7
import json

import boto3
from google_ngram_downloader import readline_google_store


def processngrams(index):
    length = 3
    try:
        ngram_dict = {}
        try:
            name, url, records = next(
                readline_google_store(ngram_len=length, indices=[index]))
        except StopIteration:
            print('url not found')
            return

        # Sum match counts per (POS-tagged) ngram.
        for record in records:
            if record.ngram in ngram_dict:
                ngram_dict[record.ngram] += record.match_count
            else:
                ngram_dict[record.ngram] = record.match_count

        # Strip the POS tags (token_TAG -> token) and re-aggregate.
        ngram_count = {}
        for key, value in ngram_dict.items():
            new_key = ' '.join(text.split('_')[0] for text in key.split())
            if new_key in ngram_count:
                ngram_count[new_key] += value
            else:
                ngram_count[new_key] = value

        filename = str(length) + '_' + index
        filepath = filename + '.json'
        with open(filepath, 'w') as fp:
            json.dump(ngram_count, fp)
        print(name)

        # aws_access_key_id and aws_secret_access_key are defined elsewhere
        # in the original module.
        s3 = boto3.client('s3',
                          region_name='ap-south-1',
                          aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access_key)
        bucket = 'ei-marketingdata'
        s3_file = 'parentReport_test/{}'.format(filepath)
        s3.upload_file(filepath, bucket, s3_file)

    except Exception as e:
        print(e)
Example #8

__author__ = 'pv'

from google_ngram_downloader import readline_google_store
from itertools import product
from string import ascii_lowercase, digits
import codecs


letter_indices = (''.join(i) for i in product(ascii_lowercase, ascii_lowercase + '_'))
letter_indices = (l for l in letter_indices if l != 'qk')

fs = []

try:
    for year in range(1850, 2010):
        fs.append(codecs.open('google-ngrams/' + str(year), 'w', "utf-8"))
except OSError:
    print("couldn't open files.", year)

i = 0
for fname, url, records in readline_google_store(ngram_len=5, lang='eng-fiction', indices=letter_indices):
    print (fname)
    for record in records:
        if 1850 <= record.year < 2010:  # fs holds files for 1850..2009
            out = fs[record.year - 1850]
            out_str = record.ngram + "\t" + str(record.match_count) + "\n"
            out.write(out_str)

for f in fs:
    f.close()
Example #9
from google_ngram_downloader import readline_google_store
# https://github.com/dimazest/google-ngram-downloader

files = readline_google_store(ngram_len=5)

f_bundle = next(files, None)

sink = open("output.txt", "w")

while f_bundle is not None:
    fname, url, records = f_bundle
    print(fname)

    r = next(records, None)
    text = ""
    count = 0
    while r is not None:
        cText = r.ngram
        cCount = r.match_count
        cYear = r.year

        # When the ngram changes, flush the accumulated pre-1980 count.
        if cText != text:
            if count > 0:
                sink.write("{}\t{}\n".format(text, count))
            count = 0
            text = cText

        if cYear < 1980:
            # print(cText, cCount, cYear)
            count += cCount
        r = next(records, None)

    # Flush the last ngram of this file, then advance to the next file.
    if count > 0:
        sink.write("{}\t{}\n".format(text, count))
    f_bundle = next(files, None)

sink.close()
Example #10

import csv
import os
import string
from ke_root import ROOT_OUTPUT
from google_ngram_downloader import readline_google_store

list_word = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'u','v', 'w', 'x', 'y', 'z', 'ch']
dict_ngram = {}
for word in list_word:
    fnames, urls, records = next(readline_google_store(ngram_len=1, indices=word, lang='spa'))
    for i in records:
        ngram = str(i.ngram).lower()
        if ngram.find('_') == -1:
            if ngram in dict_ngram:
                temp = dict_ngram.get(ngram)
                freq = temp['freq'] + i.match_count
                count = temp['count'] + 1
                dict_ngram[ngram] = {'freq': freq,'count': count}
            else:
                freq = i.match_count
                count = 1
                dict_ngram[ngram] = {'freq':freq,'count': count}
                print('Calculated value for ngram = {0}'.format(ngram))

result = {}
for k, v in dict_ngram.items():
    relative_freq = round(float(v['freq'] / v['count']), 2)
    result[k] = relative_freq
    print('ngrams = {0}, relative_freq = {1}'.format(k, relative_freq))

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
Example #12
import codecs
import collections
import sys

from google_ngram_downloader import readline_google_store
"""
Script for fetching and aggregating bigram data provided by Google:
	http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
Requires google_ngram_downloader library: 
	https://pypi.python.org/pypi/google-ngram-downloader
Usage: python count-google-bigrams.py
"""

if __name__ == "__main__":

    reload(sys)
    sys.setdefaultencoding('utf8')

    chunks = readline_google_store(ngram_len=2, lang='eng')

    for fileName, url, records in chunks:
        if fileName.endswith('punctuation.gz'): break
        print "Processing " + fileName + "..."
        counts = collections.defaultdict(int)
        for r in records:
            bigram = r.ngram
            # Ignore if containing part of speech tag or comma (later used as delimiter)
            if '_' not in bigram and ',' not in bigram:
                # Set to lowercase and split at space
                [i, j] = bigram.lower().split()
                counts[(i, j)] += r.match_count

        # Write counts to file per chunk
        output = codecs.open(fileName[:-3] + "-aggregated.txt", "w", "utf-8")
Example #13
from google_ngram_downloader import readline_google_store

# util.get_indices comes from the surrounding project (not shown here).

list_not = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_ADJ_', '_ADP_',
    '_ADV_', '_CONJ_', '_DET_', '_NOUN_', '_NUM_', '_PRON_', '_PRT_', '_VERB_'
]
ngrams = 3
result = {}
list_indices = util.get_indices(ngrams)
dict_ngram = {}
for item in list_indices:
    if item not in list_not:
        list_tmp = [item]
        try:
            fnames, urls, records = next(
                readline_google_store(ngram_len=ngrams,
                                      indices=list_tmp,
                                      lang='spa'))
            for i in records:
                try:
                    ngram = str(i.ngram).lower()
                    # print(i)
                    if ngram.find('_') == -1:
                        if ngram in dict_ngram:
                            temp = dict_ngram.get(ngram)
                            freq = float(temp['freq'] + i.match_count)
                            count = temp['count'] + 1
                            dict_ngram[ngram] = {'freq': freq, 'count': count}
                        else:
                            freq = 1 if str(i.match_count) == '' else float(
                                i.match_count)
                            count = 1
Example #14
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from google_ngram_downloader import readline_google_store
fname, url, records = next(readline_google_store(ngram_len=5))

_debug = True
#_debug = False

def debug(*values):
  if _debug:
    log('debug', *values)

def info(*values):
  log('info', *values)

def log(level, *values):
  message = '[%s]' % level
  for value in values:
    message += '\t%s' % str(value)
  print message

test_path = 'test_data/SystemOut.log'
sw_list = []

def load_stop_words():
    resource_list = ['resources/chinese_stopWord.txt', 'resources/english_stopWord.txt',
                'resources/sign_stopWord.txt', 'resources/union_stopWord.txt']
    # resource_list = ['resources/english_stopWord.txt']
    for res in resource_list:
        f = open(res)
Example #15
from google_ngram_downloader import readline_google_store

fname, url, records = next(readline_google_store(ngram_len=1, indices='.'))
for x in range(0, 5):
    print(next(records))
Example #16

import json

from kafka import SimpleProducer, KafkaClient
from google_ngram_downloader import readline_google_store

# To send messages synchronously
kafka = KafkaClient("ec2-52-35-7-236.us-west-2.compute.amazonaws.com:9092")
producer = SimpleProducer(kafka)

gene = readline_google_store(ngram_len=1, lang="eng")
while True:
    try:
        fname, url, records = next(gene)
        print url
    except StopIteration:
        print "END"
        break
Example #17
from google_ngram_downloader import readline_google_store
import wget
import hadoopy
import os

for i in range(3, 6):

	gene = readline_google_store(ngram_len=i, lang='eng')

	while True:
		try:
			fname, url, records = next(gene)
			print fname
			if hadoopy.exists('/google-ngram/'+str(i)+'/'+fname):
				continue
			else:
				wget.download(url)
				hadoopy.put(fname, '/google-ngram/'+str(i)+'/'+fname)
				os.remove(fname)
		
		except StopIteration:
			print "END"
			break
Example #18
from google_ngram_downloader import readline_google_store
fname, url, records = next(readline_google_store(ngram_len=2))
import pymongo
from pymongo import MongoClient
client = MongoClient()
db = client['ngrams']
import re

def getEntry(d):
	ngram,year,match_count,volume_count=d[0:4]
	entry={
		   'ngram':ngram,
		   'year':year,
		   'match_count':match_count,
		   'volume_count':volume_count
		   }
	return entry

inspect=[]
counter=0
previous=""
keep=0
target=[u'conspir',u'scheme',u'stratagem',u'machination',u'cabal',u'deception',u'deceit',
		u'deceive', u'ploy', u'ruse',u'dodge', u'subterfuge', u'complot',u'colluder', u'collusion',
		 u'collaborator', u'conniver', u'machinator', u'traitor',u'connive']
# Compile once, outside the loop: match any target word at a word boundary.
words_re = re.compile(r"\b(?:" + "|".join(target) + r")")
#started=False
for fname, url, records in readline_google_store(ngram_len=5,verbose=True):
	print fname
	#if 'ad.gz' in str(fname):
	#	started=True
Example #19

import sys
import re

from google_ngram_downloader import readline_google_store

if __name__ == "__main__":
    n = sys.argv[1]
    position = sys.argv[2]
    with open(f'{position}-{n}-grams.txt', 'w') as f:
        for fname, url, records in readline_google_store(ngram_len=int(n)):
            if re.search(r'[b-z][a-z]\.gz$', fname):
                print(fname)
                counter = 0
                for r in records:
                    if counter % 1000000 == 0:
                        print(r.ngram)
                    if position == 'tailing':
                        if re.search(
                                r'^[a-zA-Z]',
                                r.ngram) and r.ngram.endswith("._. _END_"):
                            f.write('{}\n'.format(r.ngram))
                    if position == "inner":
                        if all([
                                " ._. " in r.ngram,
                                " ._. ]" not in r.ngram,
                                " ._. /" not in r.ngram,
                                " ._. *" not in r.ngram,
                                not r.ngram.startswith("._."),
                                not r.ngram.endswith("_END_"),
                                not r.ngram.endswith("_."),
                        ]):
Example #20

import nltk
import re
import string

from google_ngram_downloader import readline_google_store


# In this step, we load the `file name`,`url` of the ngram, and `record`. Records can be understood as the rows in
# the ngram files.
# 
# `lang = 'chi-sim'` means just load the Chinese-Simplified.
# 
# ** Remember you need to be connected to internet for this program to work **
# 

# In[9]:

fname, url, records = next(readline_google_store(ngram_len = 4, lang = 'chi-sim'))


# You can look at the `url` and use it to download the ngram file, if you want! (NOT RECOMMENDED :))

# First, I defined an empty `dictionary` called `total` to store the records and their word counts.
# 
# Next line, I set the `notEoF` variable to be `True`. This variable is supposed to be the flag for when we reach the end of the google ngram records.
# 
# Next, I used the try/except structure that is used for error handling in Python. It reads the next record from google ngram. Reading past the last record raises a `StopIteration` error, which moves the program into the `except` section. There, `notEoF` is set to `False`, which stops the `while` loop in the `try` section.
# 
# 

# In[10]:

total = {}
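
# A sketch of the loop described above (added here; the original cell is
# truncated). Aggregating match counts into `total` by ngram is an
# assumption, not the original code:
notEoF = True
try:
    while notEoF:
        record = next(records)
        total[record.ngram] = total.get(record.ngram, 0) + record.match_count
except StopIteration:
    notEoF = False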
Example #21
import sys

from google_ngram_downloader import readline_google_store

# remove_tags is a helper from the surrounding project (not shown here);
# it strips the part-of-speech tags from an ngram.

with open(sys.argv[1]) as f:
    needed_ngrams = f.read().splitlines()

needed_ngrams.sort()

n=int(sys.argv[2])
#make a set of all indices that need to be downloaded
needed_indices = set([x[:min(n,2)].lower() for x in needed_ngrams])
#print "Needed indices:",needed_indices

#create a map with each needed ngram a key associated with a 0 value
ngram_counts_dict = {}
for needed_ngram in needed_ngrams:
	ngram_counts_dict[needed_ngram] = 0

#print ngram_counts_dict

#for each indices, iterate over all entries for the index (lines aren't sorted)
for index in needed_indices:
	fname, url, records = next(readline_google_store(ngram_len=n, indices=(index if n == 1 else [index])))
	for record in records:
		# add counts for matching terms
		record_ngram = remove_tags(record.ngram)
		if record_ngram in ngram_counts_dict:
			ngram_counts_dict[record_ngram] += record.match_count
 
#print counts
for ngram,ngram_counts in ngram_counts_dict.iteritems():
	print ngram, ngram_counts