Example #1
def get_character_n_gram_set(file_name, n):
    char_n_grams = NGrams.get_character_n_grams(
        FileIo.get_text_file_contents(file_name), n)
    char_n_gram_set = set()
    for n_gram in char_n_grams:
        char_n_gram_set.add(n_gram)
    return char_n_gram_set
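
All of these snippets depend on the same FileIo.get_text_file_contents and NGrams.get_character_n_grams helpers, whose implementations are not part of the excerpts. As a rough illustration only, the stand-ins below are assumptions about those interfaces, not the project's real classes; they simply make the function above runnable in isolation.

# Hypothetical stand-ins, only so the snippet above can be exercised on its own.
class FileIo(object):
    @staticmethod
    def get_text_file_contents(file_name):
        # Assumption: returns the text content of the file.
        with open(file_name) as input_file:
            return input_file.read()


class NGrams(object):
    @staticmethod
    def get_character_n_grams(text, n):
        # Assumption: returns the overlapping character n-grams of the text.
        return [text[i:i + n] for i in range(len(text) - n + 1)]


# Usage: collect the distinct character trigrams of a document.
trigrams = get_character_n_gram_set("document.txt", 3)
print(len(trigrams))
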
Example #2
def main():
    """
    Main method to test functions.
    """
    fio = FileIo("../input2.txt")
    text = fio.getInput()
    p = re.compile(r'#?\d[\s\.]?[\s]?')
    out = filter(None, p.split(text))
    #print out[2]
    #print len(out)
    wc = 0

    for s in out:
        text = nltk.word_tokenize(s)
        wc += wordCount(text)
    print wc
Example #3
def get_word_n_gram_set(file_name, n):

    word_n_grams = NGrams.get_word_n_grams(
        FileIo.get_text_file_contents(file_name), n)
    word_n_gram_set = set()
    for n_gram in word_n_grams:
        word_n_gram_set.add(n_gram)
    return word_n_gram_set
Example #4
    def colorManyCell(self):
        list = FileIo.getDateList()
        qtColor = Qt.darkCyan
        brush = QBrush()
        brush.setColor(qtColor)
        for file in list:
            # date = QDate(2016, 11, 09)
            dlist = self.divDate(file)
            date = QDate(dlist[0], dlist[1], dlist[2])
            cf = self.calMain.dateTextFormat(date)
            cf.setBackground(brush)
            self.calMain.setDateTextFormat(date, cf)
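
The divDate helper used above is not part of this excerpt. Judging from how its result is passed to QDate and from the yyyyMMdd date strings used in Example #5, it presumably splits a date-stamped file name into year, month and day; the sketch below is a guess under that assumption, not the actual implementation.

    def divDate(self, file_name):
        # Assumption: file names returned by FileIo.getDateList() are date
        # stamps such as "20161109" (yyyyMMdd), possibly with an extension.
        stamp = file_name.split(".")[0]
        return [int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8])]
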
Example #5
    def dateClicked(self, date):
        self.currIndex = -1
        self.emptyText()
        sDate = date.toString("yyyy년 MM월 dd일 (ddd)요일")
        self.chDate = date.toString("yyyyMMdd")
        self.lbDate.setText(sDate)
        # Delete the items
        self.emptyList()
        self.workStack.list.clear()


        """
        있던 없던 상관X
        """
        # 파일 내용
        saveData = FileIo.isAnyFile(self.chDate)
        if saveData:
            # It would be good to sort these by index if possible
            list = FileIo.getTasks(self.chDate)
            for t in list:
                self.addItems(t)

        self.setListView()
Example #6
    def __summarize_order_data(self):

        if self.data_set == FileIo.TRAINING_DATA_SET:
            data_files_path = OrderCategoricalLookup.OrderCategoricalLookup.TRAINING_DATA_ORDER_FILES_PATH
            data_files_list = OrderCategoricalLookup.OrderCategoricalLookup.TRAINING_DATA_ORDER_FILES

        else:
            data_files_path = OrderCategoricalLookup.OrderCategoricalLookup.TESTING_DATA_ORDER_FILES_PATH
            data_files_list = OrderCategoricalLookup.OrderCategoricalLookup.TESTING_DATA_ORDER_FILES

        for file_name in data_files_list:

            logging.info("RegressionInput: Summarizing orders in file " +
                         file_name + " in " + self.data_set + " data")

            # Loop through the records and load the dictionary lookup
            for record in FileIo.get_text_file_contents(data_files_path +
                                                        file_name):

                record_tokens = record.split(FileIo.TAB_CHARACTER)

                # If driver id is not present, skip the record
                order_driver_id = record_tokens[1]
                if order_driver_id == RegressionInput.DRIVER_NOT_FOUND:
                    continue

                # Use the order for regression only if both start and end districts have POI and Traffic information
                # in both train and test environments
                if not self.traffic_lookup.district_has_traffic_info(district_hash=record_tokens[3]) or \
                   not self.traffic_lookup.district_has_traffic_info(district_hash=record_tokens[4]) or \
                   not self.poi_district_lookup.district_has_poi_info(district_hash=record_tokens[3]) or \
                   not self.poi_district_lookup.district_has_poi_info(district_hash=record_tokens[4]):
                    continue

                # Create an order key to check if it already exists in summarized order data
                order_key = OrderKeyValue.OrderKey(
                    order_start_district=record_tokens[3],
                    order_destination_district=record_tokens[4],
                    order_timestamp=record_tokens[6].strip())

                if order_key in self.order_data:
                    order_value = self.order_data[order_key]
                    order_value.append_order_price(record_tokens[5])
                    self.order_data[order_key] = order_value
                else:
                    self.order_data[order_key] = OrderKeyValue.OrderValue(
                        order_price=record_tokens[5])
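
Using order_key as a dictionary key the way the method above does only works if OrderKeyValue.OrderKey is hashable and compares equal on its fields. The class itself is not shown in these excerpts; the minimal sketch below is an assumption about what such a key might look like, not the project's actual implementation.

class OrderKey(object):
    def __init__(self, order_start_district, order_destination_district, order_timestamp):
        self.order_start_district = order_start_district
        self.order_destination_district = order_destination_district
        self.order_timestamp = order_timestamp

    def __eq__(self, other):
        # Equal when all three fields match, so repeated orders collapse into one entry.
        return (self.order_start_district,
                self.order_destination_district,
                self.order_timestamp) == (other.order_start_district,
                                          other.order_destination_district,
                                          other.order_timestamp)

    def __hash__(self):
        # Hash the same tuple that drives equality, as dict lookup requires.
        return hash((self.order_start_district,
                     self.order_destination_district,
                     self.order_timestamp))
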
Example #7
    def __load_districts(self, load_training_districts):

        if load_training_districts:
            file_name = PoiDistrictLookup.TRAINING_DATA_POI_FILE_PATH
        else:
            file_name = PoiDistrictLookup.TESTING_DATA_POI_FILE_PATH

        for file_line in FileIo.get_text_file_contents(file_name):

            file_line_fields = file_line.split(FileIo.TAB_CHARACTER)

            if load_training_districts:
                self.unique_training_districts_with_poi_info.add(
                    file_line_fields[0])
            else:
                self.unique_testing_districts_with_poi_info.add(
                    file_line_fields[0])
Example #8
    def __init__(self, poi_district_lookup):

        # Create storage for categorical data
        self.district_hashes = list()
        self.poi_district_lookup = poi_district_lookup

        unique_district_hashes = set()

        # Store all unique occurrences of categorical fields
        for data_set in [FileIo.TRAINING_DATA_SET, FileIo.TEST_DATA_SET]:

            logging.info("OrderCategoricalLookup: Going through data set " +
                         data_set)

            if data_set == FileIo.TRAINING_DATA_SET:
                data_files_path = OrderCategoricalLookup.TRAINING_DATA_ORDER_FILES_PATH
                data_files_list = OrderCategoricalLookup.TRAINING_DATA_ORDER_FILES
            else:
                data_files_path = OrderCategoricalLookup.TESTING_DATA_ORDER_FILES_PATH
                data_files_list = OrderCategoricalLookup.TESTING_DATA_ORDER_FILES

            for file_name in data_files_list:

                logging.info("OrderCategoricalLookup: Going through file " +
                             file_name + " in " + data_set +
                             " data for finding all districts")

                # Loop through the records and load the dictionary lookup
                for record in FileIo.get_text_file_contents(data_files_path +
                                                            file_name):
                    record_tokens = record.split(FileIo.TAB_CHARACTER)
                    if self.poi_district_lookup.district_has_poi_info(
                            record_tokens[3]):
                        unique_district_hashes.add(record_tokens[3])
                    if self.poi_district_lookup.district_has_poi_info(
                            record_tokens[4]):
                        unique_district_hashes.add(record_tokens[4])

        # Store unique categorical field values
        self.district_hashes = list(unique_district_hashes)
        self.district_hashes.sort()

        logging.info("OrderCategoricalLookup: Found " +
                     str(len(self.district_hashes)) + " districts")
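
The sorted district_hashes list built above is presumably what later turns a district hash into a fixed-position categorical feature for the regression input. That method is not part of this excerpt; the sketch below, with a hypothetical helper name, only illustrates the idea.

    def get_district_one_hot_row(self, district_hash):
        # Hypothetical helper: one-hot encode a district against the sorted
        # list built in __init__; raises ValueError for unknown districts.
        row = [0] * len(self.district_hashes)
        row[self.district_hashes.index(district_hash)] = 1
        return row
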
Example #9
    def __init__(self, data_set):

        # Create storage to hold district data
        self.data_set = data_set
        self.district_data = dict()

        # Initialize file path based on which environment is being worked on
        if self.data_set == FileIo.TRAINING_DATA_SET:
            district_file_path = DistrictLookup.TRAINING_DATA_DISTRICT_FILE_PATH
        else:
            district_file_path = DistrictLookup.TESTING_DATA_DISTRICT_FILE_PATH

        # Fill lookup dictionary from the district file
        for record in FileIo.get_text_file_contents(district_file_path):

            record_tokens = record.split(FileIo.TAB_CHARACTER)
            self.district_data[record_tokens[0]] = record_tokens[1].strip()
Example #10
    def __init__(self, data_set):

        # Create storage to hold weather data
        self.data_set = data_set
        self.weather_data = None
        self.weather_data_keys = None
        self.all_weather_conditions = list()

        # Initialize files list based on which environment is being worked on
        data_files_path = None
        data_files_list = None
        if self.data_set == FileIo.TRAINING_DATA_SET:
            data_files_path = WeatherLookup.TRAINING_DATA_WEATHER_FILES_PATH
            data_files_list = WeatherLookup.TRAINING_DATA_WEATHER_FILES
        else:
            data_files_path = WeatherLookup.TESTING_DATA_WEATHER_FILES_PATH
            data_files_list = WeatherLookup.TESTING_DATA_WEATHER_FILES

        unsorted_weather_data = dict()
        # Fill lookup dictionary from data files
        for file_name in data_files_list:

            # Loop through the records and load the dictionary lookup
            for record in FileIo.get_text_file_contents(data_files_path +
                                                        file_name):

                record_tokens = record.split(FileIo.TAB_CHARACTER)
                unsorted_weather_data[record_tokens[0]] \
                    = WeatherSnapshot(record_tokens[1], record_tokens[2], record_tokens[3].strip())

        # Sort the weather data so that searching on timestamp is possible
        self.weather_data = OrderedDict(
            sorted(unsorted_weather_data.items(),
                   key=lambda x: time.mktime(
                       time.strptime(x[0], FileIo.TIMESTAMP_FORMAT))))
        self.weather_data_keys = list(self.weather_data.keys())

        self.__store_all_weather_conditions()
Example #11
    def __store_all_weather_conditions(self):

        data_files_path = None
        data_files_list = None
        weather_conditions_set = set()
        for data_set in [FileIo.TRAINING_DATA_SET, FileIo.TEST_DATA_SET]:

            if data_set == FileIo.TRAINING_DATA_SET:
                data_files_path = WeatherLookup.TRAINING_DATA_WEATHER_FILES_PATH
                data_files_list = WeatherLookup.TRAINING_DATA_WEATHER_FILES
            else:
                data_files_path = WeatherLookup.TESTING_DATA_WEATHER_FILES_PATH
                data_files_list = WeatherLookup.TESTING_DATA_WEATHER_FILES

            for file_name in data_files_list:

                # Loop through the records and load the dictionary lookup
                for record in FileIo.get_text_file_contents(data_files_path +
                                                            file_name):
                    record_tokens = record.split(FileIo.TAB_CHARACTER)
                    weather_conditions_set.add(int(record_tokens[1]))

        self.all_weather_conditions = list(weather_conditions_set)
        self.all_weather_conditions.sort()
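
The snapshots are sorted by timestamp and the keys are kept in a separate list, which is what makes "searching on timestamp" possible. The search method itself is not shown in this excerpt; one way it could be done with the standard bisect module is sketched below, with a hypothetical method name.

    def get_snapshot_at_or_before(self, timestamp_string):
        # Hypothetical helper: find the latest weather snapshot at or before
        # the given timestamp, using the sorted key list built in __init__.
        import bisect  # kept local so the sketch stays self-contained
        target = time.mktime(time.strptime(timestamp_string, FileIo.TIMESTAMP_FORMAT))
        # In practice these parsed key times could be precomputed once in __init__.
        key_times = [time.mktime(time.strptime(key, FileIo.TIMESTAMP_FORMAT))
                     for key in self.weather_data_keys]
        position = bisect.bisect_right(key_times, target) - 1
        if position < 0:
            return None
        return self.weather_data[self.weather_data_keys[position]]
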
Example #12
    def saveEvent(self):
        if not self.workStack.list:
            FileIo.delFile(self.chDate)
            self.colorCal("2")
        else:
            bTask = self.workStack.list.pop()
            bTask.title = self.title.text()
            bTask.content = self.content.toPlainText()
            self.workStack.list.append(bTask)
            xml = XmlMaker.mkXmlTotal(self.workStack.list)
            FileIo.newXml(xml, self.chDate)
            self.colorCal()
        saveData = FileIo.isAnyFile(self.chDate)
        if saveData:
            self.emptyList()
            self.workStack.list.clear()
            # It would be good to sort these by index if possible
            list = FileIo.getTasks(self.chDate)
            for t in list:
                self.addItems(t)
Example #13
    def __get_salt_file_character_n_grams(self):
        char_n_grams = NGrams.get_character_n_grams(FileIo.get_text_file_contents(SALT_FILE_NAME), N_GRAMS_SIZE)
        char_n_gram_set = set()
        for n_gram in char_n_grams:
            char_n_gram_set.add(n_gram)
        return list(char_n_gram_set)
Example #14
    def writeToFile(self):
        file = open('user_input.txt', 'w')
        file.write(self.text_area.get("1.0",END+"-1c"))
        file.close()

        ##############################################################################

        # initialize variables
        input = "user_input.txt"
        fio = FileIo(input)
        ip = InputProcessor()
        bp = BlockProcessor()

        # initial setup, process input, tokenize, find parts of speech
        the2DArray = ip.processInput(fio.getFile())
        the2DArray = bp.removeCommas(the2DArray)
        tokenized = ip.tokenize(the2DArray)
        pos = bp.posTagger(the2DArray)


        ##############################################################################

        # noun and verb phrase chunking
        chunkPattern = """
                NP: {<DT|PP\$>?<CD>?(<JJ>|<JJR>|<JJS>)*(<NN>|<NNP>|<NNPS>|<NNS>|<POS>)+}
                {<NNP>+}
                {<NN>+}
                {<PRP>+}
                {<DT><JJ>}
            
                VP: {<MD|TO|RB>?<VB.*>+<RB>?<VB.*>?}
                {<VB.*>+}
                """
        phraseChunk = bp.phraseChunker(tokenized, chunkPattern)
        #for tree in phraseChunk:
        #    print tree

        ##############################################################################

        # count nouns per block and total, update the2DArray
        nounDict = bp.countNouns(pos)
        for key, value in nounDict.iteritems():
            if key == 'total':
                totalNouns = value
            else:
                the2DArray = bp.updateArray(the2DArray,key,'nounCount',value)

        ##############################################################################

        # count verbs per block and total, update the2DArray
        verbDict = bp.countVerbs(pos)
        for key, value in verbDict.iteritems():
            if key == 'total':
                totalVerbs = value
            else:
                the2DArray = bp.updateArray(the2DArray,key,'verbCount',value)

        ##############################################################################

        # count adjectives per block and total, update the2DArray
        adjectiveDict = bp.countAdjectives(pos)
        for key, value in adjectiveDict.iteritems():
            if key == 'total':
                totalAdjectives = value
            else:
                the2DArray = bp.updateArray(the2DArray,key,'adjectiveCount',value)

        ##############################################################################

        # count pronouns per block and total, update the2DArray
        pronounDict = bp.countPronouns(pos)
        for key, value in pronounDict.iteritems():
            if key == 'total':
                totalPronouns = value
            else:
                the2DArray = bp.updateArray(the2DArray,key,'pronounCount',value)

        ##############################################################################

        # count adverbs per block and total, update the2DArray
        adverbDict = bp.countAdverbs(pos)
        for key, value in adverbDict.iteritems():
            if key == 'total':
                totalAdverbs = value
            else:
                the2DArray = bp.updateArray(the2DArray,key,'adverbCount',value)

        ##############################################################################

        # count other parts of speech per block and total, update the2DArray
        otherDict = bp.countOther(pos)
        for key, value in otherDict.iteritems():
            if key == 'total':
                totalOther = value
            else:
                the2DArray = bp.updateArray(the2DArray,key,'otherCount',value)

        ##############################################################################

        # count words per block and total, update the2DArray
        wordCountDict = bp.wordCount(tokenized)
        for key, value in wordCountDict.iteritems():
            if key == 'total':
                totalWordCount = value
            else:
                the2DArray = bp.updateArray(the2DArray,key,'totalWordCount',value)

        ##############################################################################

        # update the2DArray with totals
        the2DArray = bp.updateArray(the2DArray,len(the2DArray)-1,'nounCount',totalNouns)
        the2DArray = bp.updateArray(the2DArray,len(the2DArray)-1,'verbCount',totalVerbs)
        the2DArray = bp.updateArray(the2DArray,len(the2DArray)-1,'adjectiveCount',totalAdjectives)
        the2DArray = bp.updateArray(the2DArray,len(the2DArray)-1,'pronounCount',totalPronouns)
        the2DArray = bp.updateArray(the2DArray,len(the2DArray)-1,'adverbCount',totalAdverbs)
        the2DArray = bp.updateArray(the2DArray,len(the2DArray)-1,'otherCount',totalOther)
        the2DArray = bp.updateArray(the2DArray,len(the2DArray)-1,'totalWordCount',totalWordCount)

        ##############################################################################

        # process distinct word count and TF-IDF 
        distinctWordCountArray = bp.distinctWordCount(tokenized)
        tf_idfArray = bp.tf_idf_Count(tokenized)

        ##############################################################################

        # ask user for directory name where the output csv files will be saved to
        dirname = tkFileDialog.askdirectory(initialdir="/",title="Choose Directory Location for Results")
        outputDirBase = dirname + '/'
        
        # csv result files will be located in teamNLP file followed by a number
        # if one or more exist already in the user directory location
        count = 1
        baseName = 'teamNLP'
        outputFileName = outputDirBase + baseName
        while (os.path.exists(outputFileName)): # while the directory name exists
            count += 1 # increment the counter...
            outputFileName = outputDirBase + baseName + str(count) 
        os.mkdir(outputFileName) # create folder in user's chosen directory location
        
        numpy.savetxt(outputFileName + '/the2DArray.csv', the2DArray, delimiter=",", fmt="%s")
        numpy.savetxt(outputFileName + '/distinctWordCountArray.csv', distinctWordCountArray, delimiter=",", fmt="%s")
        numpy.savetxt(outputFileName + '/tf_idfArray.csv', tf_idfArray, delimiter=",", fmt="%s")
Example #15
#! /usr/bin/env python

from FileIo import *
from InputProcessor import *
from BlockProcessor import *
import nltk
from Switch import *
import sys
import os

if __name__ == "__main__":

    os.system('clear')
    input = "../input.txt"
    #input = "input2"
    fio = FileIo(input)
    ip = InputProcessor()
    bp = BlockProcessor()
    processInput = ip.processInput(fio.getFile())
    tokenized = ip.tokenize(processInput)
    pos = bp.posTagger(processInput)

    print "Original input text:"
    print "###################################################################################\n\n"
    fio.toString()

    print "\n###################################################################################\n\n"

    if (len(sys.argv) == 2):
        choice = str(sys.argv[1])	
    else:
Example #16
    def __init__(self, data_set):

        # Create storage to hold traffic data
        self.data_set = data_set
        self.traffic_data = list()
        self.traffic_data_keys = None
        self.districts_with_traffic_info = list()

        # Sets for each of the environments
        unique_districts_with_traffic_info_1 = set()
        unique_districts_with_traffic_info_2 = set()

        # Initialize files list based on which environment is being worked on
        data_files_path = None
        data_files_list = None
        if self.data_set == FileIo.TRAINING_DATA_SET:
            data_files_path = TrafficLookup.TRAINING_DATA_TRAFFIC_FILES_PATH
            data_files_list = TrafficLookup.TRAINING_DATA_TRAFFIC_FILES
        else:
            data_files_path = TrafficLookup.TESTING_DATA_TRAFFIC_FILES_PATH
            data_files_list = TrafficLookup.TESTING_DATA_TRAFFIC_FILES

        # Fill lookup dictionary from data files
        for file_name in data_files_list:

            # Loop through the records and load the dictionary lookup
            for record in FileIo.get_text_file_contents(data_files_path +
                                                        file_name):

                traffic_record = list()
                record_tokens = record.split(FileIo.TAB_CHARACTER)

                # Add district to the list of districts with traffic info
                unique_districts_with_traffic_info_1.add(record_tokens[0])

                # Separate out district hash, traffic at the four congestion levels and timestamp
                traffic_record.append(record_tokens[0])
                traffic_record.\
                    append(int(record_tokens[1].split(TrafficLookup.TRAFFIC_LEVEL_AND_ROAD_SECTION_SEPARATOR)[1]))
                traffic_record.\
                    append(int(record_tokens[2].split(TrafficLookup.TRAFFIC_LEVEL_AND_ROAD_SECTION_SEPARATOR)[1]))
                traffic_record.\
                    append(int(record_tokens[3].split(TrafficLookup.TRAFFIC_LEVEL_AND_ROAD_SECTION_SEPARATOR)[1]))
                traffic_record.\
                    append(int(record_tokens[4].split(TrafficLookup.TRAFFIC_LEVEL_AND_ROAD_SECTION_SEPARATOR)[1]))
                traffic_record.\
                    append(time.mktime(time.strptime(record_tokens[5].strip(), FileIo.TIMESTAMP_FORMAT)))

                self.traffic_data.append(traffic_record)

        # Sort the traffic data so that searching on district hash and timestamp is possible
        self.traffic_data.sort(key=lambda x: (x[0], x[5]))
        self.traffic_data_keys = [[record[0], record[5]]
                                  for record in self.traffic_data]

        # Loop through the other data set and add districts to the to list of districts with traffic info
        if self.data_set == FileIo.TRAINING_DATA_SET:
            data_files_path = TrafficLookup.TESTING_DATA_TRAFFIC_FILES_PATH
            data_files_list = TrafficLookup.TESTING_DATA_TRAFFIC_FILES
        else:
            data_files_path = TrafficLookup.TRAINING_DATA_TRAFFIC_FILES_PATH
            data_files_list = TrafficLookup.TRAINING_DATA_TRAFFIC_FILES

        for file_name in data_files_list:

            # Loop through the records and add districts to the list of districts with traffic info
            for record in FileIo.get_text_file_contents(data_files_path +
                                                        file_name):
                record_tokens = record.split(FileIo.TAB_CHARACTER)
                unique_districts_with_traffic_info_2.add(record_tokens[0])

        # Save the districts with traffic info
        self.districts_with_traffic_info = list(
            unique_districts_with_traffic_info_1.intersection(
                unique_districts_with_traffic_info_2))
        unique_districts_with_traffic_info_1 = None
        unique_districts_with_traffic_info_2 = None