def printTopMost(freq, n):
    """Capture and return what wordfreq.printTopMost(freq, n) writes to stdout.

    Parameters:
        freq: word-frequency mapping accepted by wordfreq.printTopMost.
        n:    number of top entries to print.

    Returns:
        The text wordfreq.printTopMost wrote to stdout, as a string.
    """
    saved = sys.stdout
    sys.stdout = io.StringIO()
    try:
        # Bug fix: restore stdout in a finally block so an exception inside
        # printTopMost cannot leave sys.stdout pointing at the StringIO.
        wordfreq.printTopMost(freq, n)
        out = sys.stdout.getvalue()
    finally:
        sys.stdout = saved
    return out
def main():
    """Print the N most common words of a text, excluding stop words.

    Command line:
        argv[1]: path to the stop-word file
        argv[2]: local path or http(s) URL of the text to analyse
        argv[3]: number of top words to print
    """
    stop_path = sys.argv[1]
    text_source = sys.argv[2]
    num_prints = int(sys.argv[3])  # parse early so a bad count fails fast

    if text_source.startswith(("http://", "https://")):
        # Remote text: download and split into a list of lines.
        response = urllib.request.urlopen(text_source)
        lines = response.read().decode("utf8").splitlines()
    else:
        # Bug fix: use a context manager and an explicit encoding; the old
        # code opened with the platform default encoding and leaked the
        # handle if any later call raised.
        with open(text_source, encoding="utf8") as f:
            lines = f.read().splitlines()

    with open(stop_path, encoding="utf8") as f:
        tokenized_stop_words = wordfreq.tokenize(f)

    tokenized_lines = wordfreq.tokenize(lines)
    frequencies = wordfreq.countWords(tokenized_lines, tokenized_stop_words)
    wordfreq.printTopMost(frequencies, num_prints)
def main():
    """Count word frequencies in a text and print the most common ones.

    Command line:
        argv[1]: stop-word file (one word per line)
        argv[2]: local path or http(s) URL of the text
        argv[3]: how many of the most frequent words to print
    """
    # One stop word per line of the first argument's file.
    stop_file = open(sys.argv[1], encoding="utf-8")
    stop_words = [entry.strip() for entry in stop_file]
    stop_file.close()

    source = str(sys.argv[2])
    if source.startswith(("http://", "https://")):
        # Remote source: fetch it and decode into a list of lines.
        inp_file = urllib.request.urlopen(source).read().decode("utf8").splitlines()
    else:
        # Local source: read the whole file as lines.
        local_file = open(source, encoding="utf-8")
        inp_file = local_file.read().splitlines()
        local_file.close()

    tokens = w.tokenize(inp_file)               # split lines into words
    counts = w.countWords(tokens, stop_words)   # tally, skipping stop words
    w.printTopMost(counts, int(sys.argv[3]))    # show the top N
def main():
    """Print the top argv[3] words of the text named by argv[2],
    skipping the stop words listed one-per-line in argv[1]."""
    stop_source = open(sys.argv[1], encoding="utf-8")
    stops = [row.strip() for row in stop_source]
    stop_source.close()

    # check() presumably resolves argv[2] (path or URL) to text lines —
    # defined elsewhere in this file.
    lines = check(sys.argv[2])
    words = wordfreq.tokenize(lines)
    counts = wordfreq.countWords(words, stops)
    wordfreq.printTopMost(counts, int(sys.argv[3]))
def main():
    """Show the most frequent words of a text, excluding stop words.

    Command line:
        argv[1]: stop-word file, argv[2]: text path or URL,
        argv[3]: how many rows to show.
    """
    # Bug fix: close the stop-word file deterministically; the old code
    # did a bare open(...).read() and leaked the file handle.
    with open(sys.argv[1], "r", encoding="utf8") as f:
        stopwords = f.read().splitlines()

    text_source = sys.argv[2]
    if '://' in text_source:
        # text-source is an internet url. Download over internet
        text = urllib.request.urlopen(text_source).read().decode('utf-8')
    else:
        # text-source is a local file
        with open(text_source, mode='r', encoding='utf-8') as f:
            text = f.read()

    # Split into a list of lines
    lines = text.splitlines()
    limit = int(sys.argv[3])  # sets the limit for how many words will be shown

    tokens = tokenize(lines)                     # calls the tokenize function
    count_list = countWords(tokens, stopwords)   # calls the count word function
    printTopMost(count_list, limit)              # prints the top most words
def main():
    """Print the argv[3] most common words of argv[2] (file, URL, or
    www-prefixed address), excluding the stop words in argv[1]."""
    inp_stop = open(sys.argv[1], encoding='utf8')

    source = str(sys.argv[2])
    if source[:4].lower() == "http" or source[:3].lower() == "www":
        response = urllib.request.urlopen(source)
        lines = response.read().decode("utf8").splitlines()
    else:
        response = open(source, encoding='utf8')
        lines = response  # tokenize iterates the open file line by line

    numberofwords = int(sys.argv[3])

    # Bug fix: split the stop-word text into a list of words. Passing the
    # raw string made countWords test membership by *substring*, so any
    # word occurring inside the stop-word text was wrongly discarded.
    stopwords = inp_stop.read().split()

    words = wordfreq.tokenize(lines)
    countedWords = wordfreq.countWords(words, stopwords)
    # Bug fix: printTopMost prints its own output and returns None; the
    # old code also print()-ed that return value, emitting a stray "None".
    wordfreq.printTopMost(countedWords, numberofwords)

    response.close()
    inp_stop.close()