コード例 #1
0
 def printTopMost(freq, n):
     """Return the text that ``wordfreq.printTopMost(freq, n)`` writes to stdout.

     ``sys.stdout`` is temporarily replaced by an in-memory buffer while the
     wrapped function runs. The original stream is restored in a ``finally``
     clause so that an exception inside ``wordfreq.printTopMost`` cannot leave
     the process with a redirected stdout.

     Args:
         freq: Passed through unchanged as the first argument.
         n: Passed through unchanged as the second argument.

     Returns:
         Everything written to stdout during the call, as a single string.
     """
     saved = sys.stdout
     buffer = io.StringIO()
     sys.stdout = buffer
     try:
         wordfreq.printTopMost(freq, n)
     finally:
         # Always undo the redirection, even when the wrapped call fails.
         sys.stdout = saved
     return buffer.getvalue()
コード例 #2
0
def main():
    """Run the word-frequency pipeline from command-line arguments.

    Arguments read from ``sys.argv``:
        1: path of the stop-word file.
        2: path of the text file, or an ``http://`` / ``https://`` URL.
        3: integer passed to ``wordfreq.printTopMost`` as the print count.

    Both inputs are run through ``wordfreq.tokenize``; the results go to
    ``wordfreq.countWords`` and then ``wordfreq.printTopMost``.

    Raises:
        IndexError: if fewer than three arguments were supplied.
        ValueError: if the third argument is not an integer.
        OSError: if a file or URL cannot be opened.
    """
    arg1 = sys.argv[1]
    arg2 = sys.argv[2]
    # Validate the count before touching any file or the network, so a bad
    # argument can no longer leave open handles behind.
    numPrints = int(sys.argv[3])

    # ``with`` guarantees the handles are released even if tokenizing fails.
    with open(arg1) as stop_file:
        tokenizedStopWords = wordfreq.tokenize(stop_file)

    if arg2.startswith(("http://", "https://")):
        with urllib.request.urlopen(arg2) as response:
            lines = response.read().decode("utf8").splitlines()
        tokenizedLines = wordfreq.tokenize(lines)
    else:
        with open(arg2) as text_file:
            tokenizedLines = wordfreq.tokenize(text_file)

    frequencies = wordfreq.countWords(tokenizedLines, tokenizedStopWords)
    wordfreq.printTopMost(frequencies, numPrints)
コード例 #3
0
def main():
    """Run the word-frequency pipeline from command-line arguments.

    Arguments read from ``sys.argv``:
        1: path of a UTF-8 stop-word file, one entry per line.
        2: path of a UTF-8 text file, or an ``http://`` / ``https://`` URL.
        3: integer passed to ``w.printTopMost`` as its second argument.

    The text is split into lines, tokenized with ``w.tokenize``, counted
    with ``w.countWords`` and reported with ``w.printTopMost``.
    """
    # Collect every stop word, stripped of surrounding whitespace.
    with open(sys.argv[1], encoding="utf-8") as stop_file:
        stop_words = [stop.strip() for stop in stop_file]

    # Decide whether the source is a URL or a local file; either way the
    # handle is closed as soon as its content has been read.
    source = sys.argv[2]
    if source.startswith(('http://', 'https://')):
        with urllib.request.urlopen(source) as response:
            inp_file = response.read().decode("utf8").splitlines()
    else:
        with open(source, encoding="utf-8") as local_file:
            inp_file = local_file.read().splitlines()

    # Split all words
    t_file = w.tokenize(inp_file)
    # Count words
    countDic = w.countWords(t_file, stop_words)
    # Print top N
    w.printTopMost(countDic, int(sys.argv[3]))
コード例 #4
0
def main():
    """Run the word-frequency pipeline from command-line arguments.

    Arguments read from ``sys.argv``:
        1: path of a UTF-8 stop-word file, one entry per line.
        2: text source, handed to ``check`` (defined elsewhere in this file).
        3: integer passed to ``wordfreq.printTopMost`` as its second argument.
    """
    # ``with`` closes the stop-word file even if reading it raises
    # (for example on a decoding error).
    with open(sys.argv[1], encoding="utf-8") as f1:
        stops = [line.strip() for line in f1]

    text = check(sys.argv[2])
    tokenz = wordfreq.tokenize(text)
    freks = wordfreq.countWords(tokenz, stops)
    wordfreq.printTopMost(freks, int(sys.argv[3]))
コード例 #5
0
ファイル: topmost.py プロジェクト: hugodrak/ChalmersProg
def main():
    """Run the word-frequency pipeline from command-line arguments.

    Arguments read from ``sys.argv``:
        1: path of a UTF-8 stop-word file, one entry per line.
        2: text source; treated as a URL when it contains ``://``,
           otherwise as the path of a local UTF-8 file.
        3: integer limit passed to ``printTopMost``.
    """
    # Load the stop words; the context manager closes the file handle,
    # which the previous chained ``open(...).read()`` never did.
    with open(sys.argv[1], "r", encoding="utf8") as stop_file:
        stopwords = stop_file.read().splitlines()

    text_source = sys.argv[2]

    if '://' in text_source:
        # text-source is an internet url. Download it and release the
        # connection as soon as the body has been read.
        with urllib.request.urlopen(text_source) as response:
            text = response.read().decode('utf-8')
    else:
        # text-source is a local file
        with open(text_source, mode='r', encoding='utf-8') as f:
            text = f.read()

    # Split into a list of lines
    lines = text.splitlines()

    limit = int(sys.argv[3])  # how many words will be shown
    tokens = tokenize(lines)
    count_list = countWords(tokens, stopwords)
    printTopMost(count_list, limit)
コード例 #6
0
def main():
    """Run the word-frequency pipeline from command-line arguments.

    Arguments read from ``sys.argv``:
        1: path of a UTF-8 stop-word file.
        2: text source; treated as a URL when it starts with ``http`` or
           ``www`` (case-insensitive), otherwise as a local UTF-8 file.
        3: integer passed to ``wordfreq.printTopMost`` as its second argument.

    Raises:
        IndexError: if fewer than three arguments were supplied.
        ValueError: if the third argument is not an integer.
        OSError: if a file or URL cannot be opened.
    """
    source = str(sys.argv[2])
    # Parse the count first so a bad argument fails before any I/O starts.
    numberofwords = int(sys.argv[3])

    # Pass the stop words as a list of separate words. The previous code
    # passed the whole file as one string, so a membership test in
    # countWords would have been a substring match.
    # NOTE(review): assumes stop words are whitespace-separated - confirm.
    with open(sys.argv[1], encoding='utf8') as inp_stop:
        stopwords = inp_stop.read().split()

    if source[:4].lower() == "http" or source[:3].lower() == "www":
        # urlopen rejects URLs without a scheme, so a bare "www..." address
        # needs one added before it can be fetched.
        url = source if "://" in source else "http://" + source
        with urllib.request.urlopen(url) as response:
            lines = response.read().decode("utf8").splitlines()
    else:
        # readlines() yields the same items as iterating the file object,
        # but lets the handle be closed right away.
        with open(source, encoding='utf8') as text_file:
            lines = text_file.readlines()

    words = wordfreq.tokenize(lines)
    countedWords = wordfreq.countWords(words, stopwords)
    topmost = wordfreq.printTopMost(countedWords, numberofwords)

    # Only echo a return value if there is one; this avoids printing a
    # stray "None" when printTopMost writes its own output.
    if topmost is not None:
        print(topmost)