def main():
    """Build a word-frequency table from ken.txt and append it to a file.

    Steps, in order:
      1. Load three exclusion lists (the common-words file, the stopwords
         module's list, and the CET4 list) plus the lemma list.
      2. Read ken.txt and pass it through toLowerCase, replaceNonLetters
         and lemmatize.
      3. Strip each exclusion list from the text via removeCommonWords.
      4. Split on single spaces, keep tokens of at least three characters,
         and count them.
      5. Append every word that occurs more than once to myResult10-3.txt
         as "<word>TAB<count>" lines.

    Progress messages are printed to stdout along the way.
    """
    print("Loading Words to Remove...")
    common_words = getCommonWords(
        getFileContent("5000_Most_Common_Words.txt", "r"))
    stop_words = stopwords.getStopWords()
    cet4_words = getCET4()
    lemma_table = getLemmaList(getFileContent("BNC_lemmafile5.txt", "r"))

    print("Loading Words List...")
    text = replaceNonLetters(toLowerCase(getFileContent("ken.txt", "r")))

    print("Lemmalizing...")
    text = lemmatize(text, lemma_table)

    print("Removing stopwords...")
    # Same order as always: common words, then CET4, then stopwords.
    for unwanted in (common_words, cet4_words, stop_words):
        text = removeCommonWords(text, unwanted)

    print("Counting Frequency...")
    # Tokens shorter than three characters (including the empty strings
    # produced by consecutive spaces) are dropped before counting.
    frequencies = collections.Counter(
        token for token in text.split(" ") if len(token) >= 3)

    print("Save to File...")
    # Opened in append mode: earlier results in the file are kept.
    with open("myResult10-3.txt", "a", encoding='utf-8') as out:
        for word, count in frequencies.items():
            # Words seen only once are not written.
            if count <= 1:
                continue
            out.write(word + "\t" + str(count) + "\n")
    print("Success...")
#!/usr/bin/env python import urllib, requests from bottle import route, run, template, request, static_file from triggers_json import dic from elasticsearch import Elasticsearch import stopwords stopwords = stopwords.getStopWords() trigger_list = dic.keys() print trigger_list es = Elasticsearch() @route('/', method = "GET") def home(name = None): return template('template/index.html',name=request.environ.get('REMOTE_ADDR')) @route('/<query>', method="GET") def index(query=""): filtered_query = [i for i in query.split() if i not in stopwords] ##filtering out stopwpords print query template = [dic[x] for x in query if x in dic.keys()] print template query={ "size":1, "query": {
from ConfigParser import SafeConfigParser

# Module-level constant; not referenced in the statements below.
ROWS_PER_QUERY = 500000

# Load config data: the API key is read from section [API] of config.txt.
parser = SafeConfigParser()
parser.read('config.txt')
MY_API_KEY = parser.get('API', 'MY_API_KEY')

mc = mediacloud.api.AdminMediaCloud(MY_API_KEY)  # AdminMediaCloud, rather than MediaCloud

logging.basicConfig(level=logging.DEBUG)
logging.info("-----------------------------------------------------------------")
logging.info("Starting QB data gathering")

# Build stopwords: the stock list plus every lower-cased word found in the
# first two columns of qb-table.csv.
my_stopwords = [word.lower() for word in stopwords.getStopWords()]
team_stopwords = []
qb_stopwords = []
# The context manager closes the CSV handle once all rows are consumed.
with codecs.open('qb-table.csv', 'r', 'utf-8') as qb_file:
    qb_table = csv.reader(qb_file)
    # Discard the first row (presumably a header -- confirm against the file).
    next(qb_table)
    for row in qb_table:
        # Column 0 feeds the team list, column 1 the quarterback list.
        team_stopwords.extend(word.lower() for word in row[0].split())
        qb_stopwords.extend(word.lower() for word in row[1].split())
# Pass the lists as logging arguments so formatting is deferred to the
# logging machinery instead of being done eagerly with the % operator.
logging.debug(" Added qb names to stopwords: %s", qb_stopwords)
logging.debug(" Added team names to stopwords: %s", team_stopwords)
my_stopwords = my_stopwords + qb_stopwords + team_stopwords

# Load media sources: second column of every row after the first.
# NOTE(review): `m` is left open here because code beyond this section may
# still use the handle; close it once it is no longer needed.
m = codecs.open('sources.csv', 'r', 'utf-8')
media_reader = csv.reader(m)
media = [x[1] for x in media_reader][1:]