def get_character_n_gram_set(file_name, n):
    char_n_grams = NGrams.get_character_n_grams(FileIo.get_text_file_contents(file_name), n)
    char_n_gram_set = set()
    for n_gram in char_n_grams:
        char_n_gram_set.add(n_gram)
    return char_n_gram_set
def main():
    """ Main method to test functions """
    fio = FileIo("../input2.txt")
    text = fio.getInput()
    p = re.compile(r'#?\d[\s\.]?[\s]?')
    out = filter(None, p.split(text))
    #print out[2]
    #print len(out)
    wc = 0
    for s in out:
        tokens = nltk.word_tokenize(s)
        wc += wordCount(tokens)
    print wc
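# The regular expression in main strips numbered block markers such as "1. " or "#2 "
# before the remaining text is tokenized. A small standalone illustration of the split
# (the sample string is made up; the list comprehension mirrors the filter(None, ...) call above):

import re

sample = "1. First block 2. Second block"
marker_pattern = re.compile(r'#?\d[\s\.]?[\s]?')
blocks = [block for block in marker_pattern.split(sample) if block]   # ['First block ', 'Second block']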
def get_word_n_gram_set(file_name, n):
    word_n_grams = NGrams.get_word_n_grams(FileIo.get_text_file_contents(file_name), n)
    word_n_gram_set = set()
    for n_gram in word_n_grams:
        word_n_gram_set.add(n_gram)
    return word_n_gram_set
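# The two helpers above assume NGrams.get_character_n_grams and NGrams.get_word_n_grams
# return iterables of n-grams over a text. A minimal sketch of what such helpers might look
# like (an assumption for illustration, not the project's actual NGrams implementation):

def _character_n_grams_sketch(text, n):
    """Return every run of n consecutive characters in text."""
    return [text[i:i + n] for i in range(len(text) - n + 1)]

def _word_n_grams_sketch(text, n):
    """Return every run of n consecutive whitespace-separated words in text."""
    words = text.split()
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

# Example (hypothetical): _character_n_grams_sketch("abcd", 2) -> ['ab', 'bc', 'cd']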
def colorManyCell(self):
    date_list = FileIo.getDateList()
    qtColor = Qt.darkCyan
    brush = QBrush()
    brush.setColor(qtColor)
    for file_name in date_list:
        # date = QDate(2016, 11, 09)
        dlist = self.divDate(file_name)
        date = QDate(dlist[0], dlist[1], dlist[2])
        cf = self.calMain.dateTextFormat(date)
        cf.setBackground(brush)
        self.calMain.setDateTextFormat(date, cf)
def dateClicked(self, date):
    self.currIndex = -1
    self.emptyText()
    sDate = date.toString("yyyy년 MM월 dd일 (ddd)요일")
    self.chDate = date.toString("yyyyMMdd")
    self.lbDate.setText(sDate)

    # Remove the existing items
    self.emptyList()
    self.workStack.list.clear()

    # It does not matter whether the file already existed
    # Load the file contents
    saveData = FileIo.isAnyFile(self.chDate)
    if saveData:
        # Sorting by index would be good to do here, if possible
        task_list = FileIo.getTasks(self.chDate)
        for t in task_list:
            self.addItems(t)
        self.setListView()
def __summarize_order_data(self):

    if self.data_set == FileIo.TRAINING_DATA_SET:
        data_files_path = OrderCategoricalLookup.OrderCategoricalLookup.TRAINING_DATA_ORDER_FILES_PATH
        data_files_list = OrderCategoricalLookup.OrderCategoricalLookup.TRAINING_DATA_ORDER_FILES
    else:
        data_files_path = OrderCategoricalLookup.OrderCategoricalLookup.TESTING_DATA_ORDER_FILES_PATH
        data_files_list = OrderCategoricalLookup.OrderCategoricalLookup.TESTING_DATA_ORDER_FILES

    for file_name in data_files_list:

        logging.info("RegressionInput: Summarizing orders in file " + file_name + " in " + self.data_set + " data")

        # Loop through the records and load the dictionary lookup
        for record in FileIo.get_text_file_contents(data_files_path + file_name):

            record_tokens = record.split(FileIo.TAB_CHARACTER)

            # If driver id is not present, skip the record
            order_driver_id = record_tokens[1]
            if order_driver_id == RegressionInput.DRIVER_NOT_FOUND:
                continue

            # Use the order for regression only if both start and end districts have POI and traffic
            # information in both train and test environments
            if not self.traffic_lookup.district_has_traffic_info(district_hash=record_tokens[3]) or \
               not self.traffic_lookup.district_has_traffic_info(district_hash=record_tokens[4]) or \
               not self.poi_district_lookup.district_has_poi_info(district_hash=record_tokens[3]) or \
               not self.poi_district_lookup.district_has_poi_info(district_hash=record_tokens[4]):
                continue

            # Create an order key to check if it already exists in the summarized order data
            order_key = OrderKeyValue.OrderKey(order_start_district=record_tokens[3],
                                               order_destination_district=record_tokens[4],
                                               order_timestamp=record_tokens[6].strip())

            if order_key in self.order_data:
                order_value = self.order_data[order_key]
                order_value.append_order_price(record_tokens[5])
                self.order_data[order_key] = order_value
            else:
                self.order_data[order_key] = OrderKeyValue.OrderValue(order_price=record_tokens[5])
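# __summarize_order_data groups order prices under a composite key of
# (start district, destination district, timestamp). A standalone sketch of the same
# grouping pattern using plain tuples and a dict; OrderKey/OrderValue are replaced here
# by built-in types, and the float price conversion is an assumption for illustration:

def summarize_orders_sketch(records):
    """Group tab-separated order records by (start, destination, timestamp) and collect prices."""
    order_data = {}
    for record in records:
        tokens = record.split("\t")
        key = (tokens[3], tokens[4], tokens[6].strip())            # start, destination, timestamp
        order_data.setdefault(key, []).append(float(tokens[5]))    # order price
    return order_data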
def __load_districts(self, load_training_districts):

    if load_training_districts:
        file_name = PoiDistrictLookup.TRAINING_DATA_POI_FILE_PATH
    else:
        file_name = PoiDistrictLookup.TESTING_DATA_POI_FILE_PATH

    for file_line in FileIo.get_text_file_contents(file_name):
        file_line_fields = file_line.split(FileIo.TAB_CHARACTER)
        if load_training_districts:
            self.unique_training_districts_with_poi_info.add(file_line_fields[0])
        else:
            self.unique_testing_districts_with_poi_info.add(file_line_fields[0])
def __init__(self, poi_district_lookup):

    # Create storage for categorical data
    self.district_hashes = list()
    self.poi_district_lookup = poi_district_lookup
    unique_district_hashes = set()

    # Store all unique occurrences of categorical fields
    for data_set in [FileIo.TRAINING_DATA_SET, FileIo.TEST_DATA_SET]:

        logging.info("OrderCategoricalLookup: Going through data set " + data_set)

        if data_set == FileIo.TRAINING_DATA_SET:
            data_files_path = OrderCategoricalLookup.TRAINING_DATA_ORDER_FILES_PATH
            data_files_list = OrderCategoricalLookup.TRAINING_DATA_ORDER_FILES
        else:
            data_files_path = OrderCategoricalLookup.TESTING_DATA_ORDER_FILES_PATH
            data_files_list = OrderCategoricalLookup.TESTING_DATA_ORDER_FILES

        for file_name in data_files_list:

            logging.info("OrderCategoricalLookup: Going through file " + file_name + " in " + data_set +
                         " data for finding all districts")

            # Loop through the records and load the dictionary lookup
            for record in FileIo.get_text_file_contents(data_files_path + file_name):
                record_tokens = record.split(FileIo.TAB_CHARACTER)
                if self.poi_district_lookup.district_has_poi_info(record_tokens[3]):
                    unique_district_hashes.add(record_tokens[3])
                if self.poi_district_lookup.district_has_poi_info(record_tokens[4]):
                    unique_district_hashes.add(record_tokens[4])

    # Store unique categorical field values
    self.district_hashes = list(unique_district_hashes)
    self.district_hashes.sort()
    logging.info("OrderCategoricalLookup: Found " + str(len(self.district_hashes)) + " districts")
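# The sorted self.district_hashes list gives every district hash a stable position, which is
# the usual basis for a categorical / one-hot encoding. A minimal sketch of that idea; this is
# an assumption about how OrderCategoricalLookup is used, and the helper name is hypothetical:

def district_one_hot_sketch(district_hashes, district_hash):
    """Return a one-hot list with a 1 at the district's position in the sorted hash list."""
    row = [0] * len(district_hashes)
    row[district_hashes.index(district_hash)] = 1
    return row

# Example (hypothetical): district_one_hot_sketch(['a', 'b', 'c'], 'b') -> [0, 1, 0]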
def __init__(self, data_set):

    # Create storage to hold district data
    self.data_set = data_set
    self.district_data = dict()

    # Initialize the district file path based on which environment is being worked on
    if self.data_set == FileIo.TRAINING_DATA_SET:
        district_file_path = DistrictLookup.TRAINING_DATA_DISTRICT_FILE_PATH
    else:
        district_file_path = DistrictLookup.TESTING_DATA_DISTRICT_FILE_PATH

    # Fill the lookup dictionary from the district file
    for record in FileIo.get_text_file_contents(district_file_path):
        record_tokens = record.split(FileIo.TAB_CHARACTER)
        self.district_data[record_tokens[0]] = record_tokens[1].strip()
def __init__(self, data_set):

    # Create storage to hold weather data
    self.data_set = data_set
    self.weather_data = None
    self.weather_data_keys = None
    self.all_weather_conditions = list()

    # Initialize the files list based on which environment is being worked on
    data_files_path = None
    data_files_list = None
    if self.data_set == FileIo.TRAINING_DATA_SET:
        data_files_path = WeatherLookup.TRAINING_DATA_WEATHER_FILES_PATH
        data_files_list = WeatherLookup.TRAINING_DATA_WEATHER_FILES
    else:
        data_files_path = WeatherLookup.TESTING_DATA_WEATHER_FILES_PATH
        data_files_list = WeatherLookup.TESTING_DATA_WEATHER_FILES

    unsorted_weather_data = dict()

    # Fill the lookup dictionary from the data files
    for file_name in data_files_list:
        # Loop through the records and load the dictionary lookup
        for record in FileIo.get_text_file_contents(data_files_path + file_name):
            record_tokens = record.split(FileIo.TAB_CHARACTER)
            unsorted_weather_data[record_tokens[0]] = WeatherSnapshot(record_tokens[1],
                                                                      record_tokens[2],
                                                                      record_tokens[3].strip())

    # Sort the weather data so that searching on timestamp is possible
    self.weather_data = OrderedDict(
        sorted(unsorted_weather_data.items(),
               key=lambda x: time.mktime(time.strptime(x[0], FileIo.TIMESTAMP_FORMAT))))
    self.weather_data_keys = list(self.weather_data.keys())

    self.__store_all_weather_conditions()
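# The weather snapshots above are sorted by timestamp so that the snapshot nearest to an
# arbitrary order time can be found quickly. A standalone sketch of such a search using
# bisect over a pre-sorted list of epoch seconds; the helper is an assumption for
# illustration, not the class's actual lookup method (which keys on timestamp strings):

import bisect

def find_nearest_timestamp_sketch(sorted_epoch_keys, target_epoch):
    """Return the value in sorted_epoch_keys (ascending epoch seconds) closest to target_epoch."""
    position = bisect.bisect_left(sorted_epoch_keys, target_epoch)
    if position == 0:
        return sorted_epoch_keys[0]
    if position == len(sorted_epoch_keys):
        return sorted_epoch_keys[-1]
    before = sorted_epoch_keys[position - 1]
    after = sorted_epoch_keys[position]
    return after if after - target_epoch < target_epoch - before else before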
def __store_all_weather_conditions(self):

    data_files_path = None
    data_files_list = None
    weather_conditions_set = set()

    for data_set in [FileIo.TRAINING_DATA_SET, FileIo.TEST_DATA_SET]:

        if data_set == FileIo.TRAINING_DATA_SET:
            data_files_path = WeatherLookup.TRAINING_DATA_WEATHER_FILES_PATH
            data_files_list = WeatherLookup.TRAINING_DATA_WEATHER_FILES
        else:
            data_files_path = WeatherLookup.TESTING_DATA_WEATHER_FILES_PATH
            data_files_list = WeatherLookup.TESTING_DATA_WEATHER_FILES

        for file_name in data_files_list:
            # Loop through the records and collect every distinct weather condition value
            for record in FileIo.get_text_file_contents(data_files_path + file_name):
                record_tokens = record.split(FileIo.TAB_CHARACTER)
                weather_conditions_set.add(int(record_tokens[1]))

    self.all_weather_conditions = list(weather_conditions_set)
    self.all_weather_conditions.sort()
def saveEvent(self):
    if not self.workStack.list:
        FileIo.delFile(self.chDate)
        self.colorCal("2")
    else:
        bTask = self.workStack.list.pop()
        bTask.title = self.title.text()
        bTask.content = self.content.toPlainText()
        self.workStack.list.append(bTask)
        xml = XmlMaker.mkXmlTotal(self.workStack.list)
        FileIo.newXml(xml, self.chDate)
        self.colorCal()

    saveData = FileIo.isAnyFile(self.chDate)
    if saveData:
        self.emptyList()
        self.workStack.list.clear()
        # Sorting by index would be good to do here, if possible
        task_list = FileIo.getTasks(self.chDate)
        for t in task_list:
            self.addItems(t)
def __get_salt_file_character_n_grams(self):
    char_n_grams = NGrams.get_character_n_grams(FileIo.get_text_file_contents(SALT_FILE_NAME), N_GRAMS_SIZE)
    char_n_gram_set = set()
    for n_gram in char_n_grams:
        char_n_gram_set.add(n_gram)
    return list(char_n_gram_set)
def writeToFile(self):
    f = open('user_input.txt', 'w')
    f.write(self.text_area.get("1.0", END + "-1c"))
    f.close()

##############################################################################
# initialize variables
input_file = "user_input.txt"
fio = FileIo(input_file)
ip = InputProcessor()
bp = BlockProcessor()

# initial setup, process input, tokenize, find parts of speech
the2DArray = ip.processInput(fio.getFile())
the2DArray = bp.removeCommas(the2DArray)
tokenized = ip.tokenize(the2DArray)
pos = bp.posTagger(the2DArray)

##############################################################################
# noun and verb phrase chunking
chunkPattern = """
    NP: {<DT|PP\$>?<CD>?(<JJ>|<JJR>|<JJS>)*(<NN>|<NNP>|<NNPS>|<NNS>|<POS>)+}
        {<NNP>+}
        {<NN>+}
        {<PRP>+}
        {<DT><JJ>}
    VP: {<MD|TO|RB>?<VB.*>+<RB>?<VB.*>?}
        {<VB.*>+}
    """
phraseChunk = bp.phraseChunker(tokenized, chunkPattern)
#for tree in phraseChunk:
#    print tree

##############################################################################
# count nouns per block and total, update the2DArray
nounDict = bp.countNouns(pos)
for key, value in nounDict.iteritems():
    if key == 'total':
        totalNouns = value
    else:
        the2DArray = bp.updateArray(the2DArray, key, 'nounCount', value)

##############################################################################
# count verbs per block and total, update the2DArray
verbDict = bp.countVerbs(pos)
for key, value in verbDict.iteritems():
    if key == 'total':
        totalVerbs = value
    else:
        the2DArray = bp.updateArray(the2DArray, key, 'verbCount', value)

##############################################################################
# count adjectives per block and total, update the2DArray
adjectiveDict = bp.countAdjectives(pos)
for key, value in adjectiveDict.iteritems():
    if key == 'total':
        totalAdjectives = value
    else:
        the2DArray = bp.updateArray(the2DArray, key, 'adjectiveCount', value)

##############################################################################
# count pronouns per block and total, update the2DArray
pronounDict = bp.countPronouns(pos)
for key, value in pronounDict.iteritems():
    if key == 'total':
        totalPronouns = value
    else:
        the2DArray = bp.updateArray(the2DArray, key, 'pronounCount', value)

##############################################################################
# count adverbs per block and total, update the2DArray
adverbDict = bp.countAdverbs(pos)
for key, value in adverbDict.iteritems():
    if key == 'total':
        totalAdverbs = value
    else:
        the2DArray = bp.updateArray(the2DArray, key, 'adverbCount', value)

##############################################################################
# count other parts of speech per block and total, update the2DArray
otherDict = bp.countOther(pos)
for key, value in otherDict.iteritems():
    if key == 'total':
        totalOther = value
    else:
        the2DArray = bp.updateArray(the2DArray, key, 'otherCount', value)

##############################################################################
# count words per block and total, update the2DArray
wordCountDict = bp.wordCount(tokenized)
for key, value in wordCountDict.iteritems():
    if key == 'total':
        totalWordCount = value
    else:
        the2DArray = bp.updateArray(the2DArray, key, 'totalWordCount', value)

##############################################################################
# update the2DArray with totals
the2DArray = bp.updateArray(the2DArray, len(the2DArray) - 1, 'nounCount', totalNouns)
the2DArray = bp.updateArray(the2DArray, len(the2DArray) - 1, 'verbCount', totalVerbs)
the2DArray = bp.updateArray(the2DArray, len(the2DArray) - 1, 'adjectiveCount', totalAdjectives)
the2DArray = bp.updateArray(the2DArray, len(the2DArray) - 1, 'pronounCount', totalPronouns)
the2DArray = bp.updateArray(the2DArray, len(the2DArray) - 1, 'adverbCount', totalAdverbs)
the2DArray = bp.updateArray(the2DArray, len(the2DArray) - 1, 'otherCount', totalOther)
the2DArray = bp.updateArray(the2DArray, len(the2DArray) - 1, 'totalWordCount', totalWordCount)

##############################################################################
# process distinct word count and TF-IDF
distinctWordCountArray = bp.distinctWordCount(tokenized)
tf_idfArray = bp.tf_idf_Count(tokenized)

##############################################################################
# ask the user for the directory where the output csv files will be saved
dirname = tkFileDialog.askdirectory(initialdir="/", title="Choose Directory Location for Results")
outputDirBase = dirname + '/'

# csv result files are placed in a teamNLP folder, with a number appended
# if one or more such folders already exist in the chosen directory
count = 1
baseName = 'teamNLP'
outputFileName = outputDirBase + baseName
while os.path.exists(outputFileName):            # while the directory name exists
    count += 1                                   # increment the counter...
    outputFileName = outputDirBase + baseName + str(count)
os.mkdir(outputFileName)                         # create the folder in the user's chosen location

numpy.savetxt(outputFileName + '/the2DArray.csv', the2DArray, delimiter=",", fmt="%s")
numpy.savetxt(outputFileName + '/distinctWordCountArray.csv', distinctWordCountArray, delimiter=",", fmt="%s")
numpy.savetxt(outputFileName + '/tf_idfArray.csv', tf_idfArray, delimiter=",", fmt="%s")
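# The chunkPattern grammar above is the kind of regular-expression grammar that
# nltk.RegexpParser consumes: each rule chunks runs of POS tags into NP or VP subtrees.
# A small self-contained illustration of that usage; the sentence and variable names are
# made up, and bp.phraseChunker presumably wraps something similar:

import nltk

grammar = r"""
    NP: {<DT>?<JJ>*<NN.*>+}
    VP: {<MD|TO|RB>?<VB.*>+}
    """
tagged = [("the", "DT"), ("quick", "JJ"), ("fox", "NN"), ("can", "MD"), ("jump", "VB")]
chunk_parser = nltk.RegexpParser(grammar)
tree = chunk_parser.parse(tagged)   # a Tree with NP and VP subtrees over the tagged tokens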
#! /usr/bin/env python
from FileIo import *
from InputProcessor import *
from BlockProcessor import *
import nltk
from Switch import *
import sys
import os

if __name__ == "__main__":

    os.system('clear')

    input_file = "../input.txt"
    #input_file = "input2"
    fio = FileIo(input_file)
    ip = InputProcessor()
    bp = BlockProcessor()

    processInput = ip.processInput(fio.getFile())
    tokenized = ip.tokenize(processInput)
    pos = bp.posTagger(processInput)

    print "Original input text:"
    print "###################################################################################\n\n"
    fio.toString()
    print "\n###################################################################################\n\n"

    if len(sys.argv) == 2:
        choice = str(sys.argv[1])
    else:
def __init__(self, data_set):

    # Create storage to hold traffic data
    self.data_set = data_set
    self.traffic_data = list()
    self.traffic_data_keys = None
    self.districts_with_traffic_info = list()

    # Sets for each of the environments
    unique_districts_with_traffic_info_1 = set()
    unique_districts_with_traffic_info_2 = set()

    # Initialize the files list based on which environment is being worked on
    data_files_path = None
    data_files_list = None
    if self.data_set == FileIo.TRAINING_DATA_SET:
        data_files_path = TrafficLookup.TRAINING_DATA_TRAFFIC_FILES_PATH
        data_files_list = TrafficLookup.TRAINING_DATA_TRAFFIC_FILES
    else:
        data_files_path = TrafficLookup.TESTING_DATA_TRAFFIC_FILES_PATH
        data_files_list = TrafficLookup.TESTING_DATA_TRAFFIC_FILES

    # Fill the lookup list from the data files
    for file_name in data_files_list:

        # Loop through the records and load the traffic data
        for record in FileIo.get_text_file_contents(data_files_path + file_name):

            traffic_record = list()
            record_tokens = record.split(FileIo.TAB_CHARACTER)

            # Add district to the list of districts with traffic info
            unique_districts_with_traffic_info_1.add(record_tokens[0])

            # Separate out district hash, traffic at the four congestion levels and timestamp
            traffic_record.append(record_tokens[0])
            traffic_record.append(int(record_tokens[1].split(TrafficLookup.TRAFFIC_LEVEL_AND_ROAD_SECTION_SEPARATOR)[1]))
            traffic_record.append(int(record_tokens[2].split(TrafficLookup.TRAFFIC_LEVEL_AND_ROAD_SECTION_SEPARATOR)[1]))
            traffic_record.append(int(record_tokens[3].split(TrafficLookup.TRAFFIC_LEVEL_AND_ROAD_SECTION_SEPARATOR)[1]))
            traffic_record.append(int(record_tokens[4].split(TrafficLookup.TRAFFIC_LEVEL_AND_ROAD_SECTION_SEPARATOR)[1]))
            traffic_record.append(time.mktime(time.strptime(record_tokens[5].strip(), FileIo.TIMESTAMP_FORMAT)))

            self.traffic_data.append(traffic_record)

    # Sort the traffic data so that searching on district hash and timestamp is possible
    self.traffic_data.sort(key=lambda x: (x[0], x[5]))
    self.traffic_data_keys = [[record[0], record[5]] for record in self.traffic_data]

    # Loop through the other data set and add its districts to the list of districts with traffic info
    if self.data_set == FileIo.TRAINING_DATA_SET:
        data_files_path = TrafficLookup.TESTING_DATA_TRAFFIC_FILES_PATH
        data_files_list = TrafficLookup.TESTING_DATA_TRAFFIC_FILES
    else:
        data_files_path = TrafficLookup.TRAINING_DATA_TRAFFIC_FILES_PATH
        data_files_list = TrafficLookup.TRAINING_DATA_TRAFFIC_FILES

    for file_name in data_files_list:
        # Loop through the records and add districts to the list of districts with traffic info
        for record in FileIo.get_text_file_contents(data_files_path + file_name):
            record_tokens = record.split(FileIo.TAB_CHARACTER)
            unique_districts_with_traffic_info_2.add(record_tokens[0])

    # Save only the districts that have traffic info in both data sets
    self.districts_with_traffic_info = list(
        unique_districts_with_traffic_info_1.intersection(unique_districts_with_traffic_info_2))
    unique_districts_with_traffic_info_1 = None
    unique_districts_with_traffic_info_2 = None
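# traffic_data is sorted on (district hash, epoch timestamp) and traffic_data_keys mirrors
# those two columns, so a binary search can locate the traffic snapshot at or just before a
# given time for a district. A standalone sketch of that search; the helper below is an
# assumption for illustration, not TrafficLookup's actual lookup method:

import bisect

def find_traffic_record_sketch(traffic_data_keys, district_hash, epoch_timestamp):
    """Return the index of the latest record for district_hash at or before epoch_timestamp, or None."""
    position = bisect.bisect_right(traffic_data_keys, [district_hash, epoch_timestamp])
    if position == 0:
        return None
    candidate = position - 1
    if traffic_data_keys[candidate][0] != district_hash:
        return None
    return candidate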