Example #1
0
def main():
    """Build a frequency list of uncommon words from ken.txt and append it to a result file.

    Pipeline: load filter word lists -> normalise the target text ->
    lemmatize -> strip common/CET4/stop words -> count frequencies ->
    append words seen more than once as "word<TAB>count" lines.
    """
    # Word lists used for filtering
    print("Loading Words to Remove...")
    removeList = getCommonWords(getFileContent("5000_Most_Common_Words.txt", "r"))
    stopWords = stopwords.getStopWords()
    cet4List = getCET4()

    lemmaList = getLemmaList(getFileContent("BNC_lemmafile5.txt", "r"))

    # Target text, lower-cased with non-letters replaced
    print("Loading Words List...")
    text = getFileContent("ken.txt", "r")
    text = replaceNonLetters(toLowerCase(text))

    print("Lemmalizing...")
    text = lemmatize(text, lemmaList)

    # Strip each filter list in turn (same order as before: common, CET4, stop)
    print("Removing stopwords...")
    for filterList in (removeList, cet4List, stopWords):
        text = removeCommonWords(text, filterList)

    print("Counting Frequency...")
    words = [w for w in text.split(" ") if len(w) >= 3]  # drop very short tokens
    freq = collections.Counter(words)

    # Persist only words that occur more than once
    print("Save to File...")
    with open("myResult10-3.txt", "a", encoding='utf-8') as f:
        for word, count in freq.items():
            if count > 1:
                f.write(word + "\t" + str(count) + "\n")
    print("Success...")
#!/usr/bin/env python

import urllib, requests
from bottle import route, run, template, request, static_file
from triggers_json import dic
from elasticsearch import Elasticsearch
import stopwords
# NOTE(review): this rebinds the name `stopwords` from the imported module to the
# word list it returns — the module is shadowed for the rest of the file.
stopwords = stopwords.getStopWords()
# Trigger words are the keys of the `dic` mapping imported from triggers_json.
trigger_list = dic.keys()
print trigger_list  # Python 2 print statement; logs the triggers at startup

# Module-level Elasticsearch client (default localhost:9200), shared by all handlers.
es = Elasticsearch()


@route('/', method = "GET")
def home(name = None):
	"""Serve the landing page, handing the caller's remote address to the template."""
	remote_addr = request.environ.get('REMOTE_ADDR')
	return template('template/index.html', name=remote_addr)

@route('/<query>', method="GET")
def index(query=""):
	
	filtered_query = [i for i in query.split() if i not in stopwords]	##filtering out stopwpords
	
	print query

	template = [dic[x] for x in query if x in dic.keys()]
	print template

	query={
   "size":1,
   "query": {
from ConfigParser import SafeConfigParser

# Maximum number of rows to request per query (presumably against the MediaCloud
# API — TODO confirm where this constant is consumed further down the file).
ROWS_PER_QUERY = 500000

# Load config data
# NOTE(review): assumes config.txt contains an [API] section with a MY_API_KEY
# option; parser.get raises NoSectionError/NoOptionError otherwise.
parser = SafeConfigParser()
parser.read('config.txt')
MY_API_KEY = parser.get('API','MY_API_KEY')
mc = mediacloud.api.AdminMediaCloud(MY_API_KEY) #AdminMediaCloud, rather than MediaCloud

# Verbose logging for the whole run; a banner marks the start of each invocation.
logging.basicConfig(level=logging.DEBUG)
logging.info("-----------------------------------------------------------------")
logging.info("Starting QB data gathering")

# build stopwords
# Base stopword list, lower-cased so all comparisons are case-insensitive.
my_stopwords = [word.lower() for word in stopwords.getStopWords()]
# qb-table.csv: column 0 holds team names, column 1 holds QB names
# (assumed from the variable names below — TODO confirm against the file).
qb_table = csv.reader(codecs.open('qb-table.csv', 'r', 'utf-8'))
qb_table.next()  # skip the header row (Python 2 iterator protocol)
team_stopwords = []
qb_stopwords = []
# Plain loops instead of the original list comprehensions that were run only
# for their .append() side effects (each built and discarded a list of Nones).
for row in qb_table:
    team_stopwords.extend(word.lower() for word in row[0].split())
    qb_stopwords.extend(word.lower() for word in row[1].split())
logging.debug(" Added qb names to stopwords: %s" % qb_stopwords)
logging.debug(" Added team names to stopwords: %s" % team_stopwords)
# Final stopword list: base words + QB names + team names.
my_stopwords = my_stopwords + qb_stopwords + team_stopwords

# load media sources
# sources.csv: each row's second column holds the media source identifier;
# the trailing [1:] slice drops the header row after reading.
# Fix: the original opened the file and never closed it — the `with` block
# guarantees the handle is released once the rows are consumed.
with codecs.open('sources.csv', 'r', 'utf-8') as m:
    media_reader = csv.reader(m)
    media = [x[1] for x in media_reader][1:]