Example #1
import os
import Utilities
from bs4 import BeautifulSoup

def crawlerWordFrequencies():
     '''For each file in the directory, gather all of the text from the meaningful text tags,
     tokenize it, then add it to a WordFrequencyCounter; repeat for each file.'''
     def freqSort(x):
          '''
          sorts by greatest frequency and then alphabetically
          '''
          return (-x[1], x[0])
     
     parsedCount = 1 # running count of files processed (incremented in the finally block below)
     wordFreqs = dict() # accumulated word -> frequency mapping
     for fileName in os.listdir("data/content"):
          if os.stat("data/content/" + fileName).st_size > 0: # skip empty files
               try:
                    with open("data/content/" + fileName) as mUp:
                         text = BeautifulSoup(mUp.read(), 'html.parser').get_text()
                         wordFreqs.update(wordFrequencyCount(Utilities.tokenizeFile(text)))
               except Exception: # skip files that cannot be read or parsed
                    continue
               finally:
                    parsedCount += 1
     
     topFH = 0
     with open('CommonWords.txt', 'a') as f: # open once instead of re-opening per word
          for item in sorted(wordFreqs.items(), key=freqSort):
               if topFH == 500: # keep only the 500 most frequent words
                    break
               f.write('{0:<20} {1}\n'.format(item[0], item[1]))
               topFH += 1
          
     print('Completed Word Frequency Accumulation')
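
# The crawlerWordFrequencies function above relies on a wordFrequencyCount
# helper that is not part of this excerpt. A minimal sketch, assuming it takes
# a token list and returns a plain dict of word -> count (the shape that
# wordFreqs.update() expects); the real implementation may differ.
from collections import defaultdict

def wordFrequencyCount(tokens):
    '''Hypothetical sketch: count how many times each token occurs.'''
    counts = defaultdict(int)
    for token in tokens:
        counts[token] += 1
    return dict(counts)
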
def main(filename: str) -> None:
    '''
    Finds the palindromes in the file and prints the number of occurrences of each.
    '''
    words = Utilities.tokenizeFile(filename)
    frequencies = computePalindromeFrequencies(words)
    Utilities.printFrequencies(frequencies)
def main(filename: str) -> None:
    '''
    Computes two-word combinations (2-grams) and outputs them with their
    frequencies.
    '''
    words = Utilities.tokenizeFile(filename)
    frequencies = computeTwoGramFrequencies(words)
    Utilities.printFrequencies(frequencies)
def main(filename: str) -> None:
    '''
    Computes word tokens and outputs them with their
    frequencies.
    '''
    words = Utilities.tokenizeFile(filename) # pass the input file name here; the file must be in the same directory
    frequencies = computeWordFrequencies(words)
    Utilities.printFrequencies(frequencies)
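
# Utilities.tokenizeFile is called throughout this code but the Utilities
# module is not included in this excerpt. A minimal sketch of what it might
# look like, assuming it lowercases its input and splits on non-alphanumeric
# characters (the exact tokenization rules are an assumption):
import re

def tokenizeFile(text):
    '''Hypothetical sketch (would live in Utilities.py): return a list of
    lowercase alphanumeric tokens extracted from the given text.'''
    return [token for token in re.split(r'[^a-z0-9]+', text.lower()) if token]
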
def crawlerWordFrequencies():
     '''For each file in the directory, gather all of the text from the meaningful text tags,
     tokenize it, then add it to a WordFrequencyCounter; also track the longest page seen. Repeat for each file.'''
     def freqSort(x):
          '''
          sorts by greatest frequency and then alphabetically
          '''
          return (-x[1], x[0])
     longestText = ''
     longestTextFileName = ''
     wordFreqs = dict()
     for fileName in os.listdir("data/content"):
          if os.stat("data/content/" + fileName).st_size > 0: # skip empty files
               try:
                    with open("data/content/" + fileName) as mUp:
                         text = BeautifulSoup(mUp.read(), 'html.parser').get_text()
                         wordFreqs.update(wordFrequencyCount(Utilities.tokenizeFile(text)))
                         if len(text) > len(longestText): # track the longest page seen so far
                              longestText = text
                              longestTextFileName = fileName
               except Exception: # skip files that cannot be read or parsed
                    continue
          print(fileName)
     with open("longestPage.txt",'a') as lPage:
          lPage.write(longestTextFileName+'\n')
          lPage.write(longestText)
                     
     topFH = 0
     with open('CommonWords.txt', 'a') as f: # open once instead of re-opening per word
          for item in sorted(wordFreqs.items(), key=freqSort):
               print(item[1]) # progress output: the word's frequency
               if topFH == 500: # keep only the 500 most frequent words
                    break
               f.write(item[0] + '\n')
               topFH += 1
          
     print('Completed Word Frequency Accumulation')

import Utilities
from Frequency import Frequency
from collections import defaultdict

def palindromeFrequencyCount(tokens: [str]) -> [Frequency]:
     '''
     Counts the frequency of palindromes formed by concatenating consecutive tokens.
     '''
     if not tokens: #check if list is empty, [] == False
          return []
     
     tempFreq = defaultdict(int)
     palindromeAccumulator = '' # accumulates tokens into a string until a palindrome is formed
     tokens.append('addOne') # sentinel: for the algorithm to work, the last token must end a palindrome
     
     for i in range(len(tokens)): 
          '''
          Iterates over the tokens list. Each iteration compares palindromeAccumulator with its reverse.
          If they do not match, or the accumulator is still empty (the entry case), tokens[i] is appended to palindromeAccumulator.
          Otherwise a palindrome has been formed, so its count in tempFreq is incremented and palindromeAccumulator is reset to tokens[i].
          '''
          if palindromeAccumulator != palindromeAccumulator[::-1] or palindromeAccumulator == '': # not yet a palindrome (or empty)
               palindromeAccumulator += tokens[i] # keep accumulating tokens
          else:
               tempFreq[palindromeAccumulator] += 1 # record the completed palindrome
               palindromeAccumulator = tokens[i] # start a new accumulation from tokens[i]
          
     return Utilities.collateFrequencies(tempFreq) 

if __name__ == '__main__':
     #createTestFile()
     Utilities.printFrequencies(palindromeFrequencyCount(Utilities.tokenizeFile(open('test.txt').read())))
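
# The Frequency class and the Utilities.collateFrequencies / printFrequencies
# helpers are referenced above but not included in this excerpt. A minimal
# sketch based only on how they are called here (names, fields, and sort order
# are assumptions):
class Frequency:
    '''Hypothetical sketch: a simple (token, frequency) pair.'''
    def __init__(self, token, frequency):
        self.token = token
        self.frequency = frequency

def collateFrequencies(freqDict):
    '''Hypothetical sketch: convert a token -> count dict into Frequency
    objects, sorted by descending frequency and then alphabetically.'''
    return [Frequency(token, count)
            for token, count in sorted(freqDict.items(), key=lambda kv: (-kv[1], kv[0]))]

def printFrequencies(frequencies):
    '''Hypothetical sketch: print one "token  count" line per Frequency.'''
    for freq in frequencies:
        print('{0:<20} {1}'.format(freq.token, freq.frequency))
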
TwoGramFrequencyCounter.py

'''
Counts the frequency of two-word combinations (2-grams) from a tokenized file.
'''
import Utilities
from Frequency import Frequency
from collections import defaultdict

def twoGramFrequencyCount(tokens: [str]) -> [Frequency]:
     '''
     Counts the frequency of 2-grams from a tokenized list.
     '''
     if not tokens: #check if list is empty, [] == False
          return []
     
     tokens = list(filter(lambda x: x != '', tokens)) # filter out empty strings
     
     tempFreq = defaultdict(int)
     
     for twoGram in zip(tokens, tokens[1:]): # pairs each token with the token that follows it
          '''
          Iterate over the token list zipped with the same list offset by one to create the 2-grams,
          using a defaultdict(int) with each 2-gram as the key and its number of occurrences as the value.
          '''
          tempFreq[' '.join(twoGram)] += 1 # tempFreq['token token'] = frequency value
     
     return Utilities.collateFrequencies(tempFreq)
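
# A quick illustration of the zip-offset technique used above:
# >>> tokens = ['the', 'quick', 'brown', 'fox']
# >>> list(zip(tokens, tokens[1:]))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]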

if __name__ == '__main__':
     Utilities.printFrequencies(twoGramFrequencyCount(Utilities.tokenizeFile(open('xyz.txt').read())))