Example #1
0
 def __init__(self, context_n=3):
     """Create the word vectoriser and record the dimensions derived from it.

     ``context_n`` is accepted but is not read anywhere in this body.
     ``phrase_dim`` is whatever ``FeatureVector.dim()`` reports and
     ``phr_cox_dim`` is four times that value.
     """
     vectoriser = FeatureVector()
     word_dim = vectoriser.dim()
     self.featureVector = vectoriser
     self.phrase_dim = word_dim
     self.phr_cox_dim = word_dim * 4
 def __init__(self):
     """Create the FeatureVector instance kept in the name-private ``__featureVector`` attribute."""
     self.__featureVector = FeatureVector()
Example #3
0
class Preprocessor:
    """Turn noun phrases and their contexts into numeric feature arrays.

    Each word is vectorised by a ``FeatureVector``. The word vectors of a
    phrase are stacked into a matrix, and a phrase matrix plus its context
    matrix are summarised by their column-wise maxima and minima.
    """

    def __init__(self, context_n=3):
        """Create the word vectoriser and cache the derived dimensions.

        Args:
            context_n: Accepted for interface compatibility; it is not used
                by any code in this class.
        """
        self.featureVector = FeatureVector()
        self.phrase_dim = self.featureVector.dim()
        # minMaxVector() joins four vectors of per-word width:
        # phrase max, phrase min, context max, context min.
        self.phr_cox_dim = self.phrase_dim * 4

    def getDim(self):
        """Return the tuple ``(phrase_dim, phr_cox_dim)``."""
        return self.phrase_dim, self.phr_cox_dim

    def concat(self, phrase, tags):
        """Vectorise every word of ``phrase`` and stack the vectors row-wise.

        Args:
            phrase: Whitespace-separated words.
            tags: Sequence with one tag per word, in word order.

        Returns:
            A 2-D array with exactly one row per word.

        Raises:
            IndexError: If ``phrase`` contains no words or ``tags`` has
                fewer entries than there are words.
        """
        words = phrase.split()
        # Indexing (rather than zip) keeps the IndexError for an empty phrase
        # or a too-short tag list instead of silently truncating.
        rows = [
            np.reshape(
                self.featureVector.vectorise(words[0], tags[0]), (1, -1))
        ]
        # Start at 1: word 0 is already in ``rows``. Starting at 0 would
        # stack the first word twice for every multi-word phrase.
        rows.extend(
            self.featureVector.vectorise(words[i], tags[i])
            for i in range(1, len(words)))
        # One vstack over all rows instead of re-allocating per word.
        return np.vstack(rows)

    def minMaxVector(self, phrase_vec, context_vec):
        """Summarise a phrase matrix and a context matrix in a single vector.

        Args:
            phrase_vec: 2-D array as produced by ``concat`` for the phrase.
            context_vec: 2-D array as produced by ``concat`` for the context.

        Returns:
            The concatenation of the phrase's column-wise max and min
            followed by the context's column-wise max and min.
        """
        phrase_max_min = np.concatenate(
            (phrase_vec.max(axis=0), phrase_vec.min(axis=0)))
        context_max_min = np.concatenate(
            (context_vec.max(axis=0), context_vec.min(axis=0)))
        return np.concatenate((phrase_max_min, context_max_min))

    def preprocess(self,
                   noun_phrases,
                   context,
                   np_tags,
                   context_tags,
                   skills=False):
        """Vectorise phrases and contexts and, optionally, label the phrases.

        Args:
            noun_phrases: Sequence of phrase strings.
            context: Sequence of context strings, parallel to ``noun_phrases``.
            np_tags: Per-phrase tag sequences, parallel to ``noun_phrases``.
            context_tags: Per-context tag sequences, parallel to ``context``.
            skills: Iterable of skill strings used for labelling. ``False``
                (the default) or ``None`` disables labelling.

        Returns:
            ``(phrases_vec, context_vec, phr_cox_vec)`` without labelling, or
            ``(phrases_vec, context_vec, phr_cox_vec, y)`` with labelling,
            where ``y[i]`` is 1 when the lower-cased phrase ``i`` is one of
            the lower-cased skills and 0 otherwise.

        Raises:
            IndexError: If a parallel sequence is shorter than
                ``noun_phrases``.
        """
        # Identity checks: ``skills != False`` is ambiguous for array-like
        # input and would let ``None`` through to the iteration below.
        labelled = skills is not False and skills is not None
        known_skills = set()
        if labelled:
            # A set makes each membership test O(1) instead of a list scan.
            known_skills = {x.lower() for x in skills}

        phrases_vec = []
        context_vec = []
        phr_cox_vec = []
        y = []

        for i, phrase in enumerate(noun_phrases):
            current_phrase_vec = self.concat(phrase, np_tags[i])
            current_context_vec = self.concat(context[i], context_tags[i])
            phrases_vec.append(current_phrase_vec)
            context_vec.append(current_context_vec)
            phr_cox_vec.append(
                self.minMaxVector(current_phrase_vec, current_context_vec))
            if labelled:
                y.append(1 if phrase.lower() in known_skills else 0)

        if labelled:
            return phrases_vec, context_vec, phr_cox_vec, y
        return phrases_vec, context_vec, phr_cox_vec
class ArffHandler:
    """Write and read sparse ARFF files with the help of a ``FeatureVector``."""

    def __init__(self):
        """Create the FeatureVector helper used by every method of this class."""
        self.__featureVector = FeatureVector()

    def generateArffFile(self, filename, data):
        """Generate a sparse ARFF file named ``filename + ".arff"``.

        Args:
            filename: Base name of the output file. It is also written
                verbatim as the ``@RELATION`` name.
            data: Mapping of page id to a record with the keys ``"class"``
                and ``"data"``, for example::

                    data = {5000: {
                        "class": 1,
                        "data": {"medicine": 400, "cars": 3, "health": 9999},
                    }}

        Returns:
            The name of the file that was written (``filename + ".arff"``).
        """
        self.__featureVector.createCountMap(data)

        path = filename + ".arff"
        # ``with`` closes the file even if one of the writes below raises.
        with open(path, "w") as out:
            out.write("@RELATION " + filename + "\n")

            sortedList = self.__featureVector.getSortedVocabularyMap()

            # The two first attributes (indices 0 and 1) are reserved.
            out.write("@ATTRIBUTE CLASS {0,1}\n")
            out.write("@ATTRIBUTE CATEGORYNAME STRING\n")

            for attribute in sortedList:
                out.write("@ATTRIBUTE " + attribute[0] + "-"
                          + str(attribute[1]) + " NUMERIC\n")

            out.write("@DATA\n")
            # ``items()`` exists on Python 2 and 3; ``iteritems()`` does not.
            for categoryName, innerMap in data.items():
                # str(): class and page id are ints in the documented format,
                # and str + int concatenation raises TypeError.
                cells = [
                    "0 " + str(innerMap["class"]),
                    "1 " + str(categoryName),
                ]
                cells.extend(
                    str(self.__featureVector.getIndex(k)) + " " + str(v)
                    for k, v in innerMap["data"].items())
                out.write("{" + ",".join(cells) + "}\n")

        logging.info("Created file on local drive: %s.arff", filename)
        return path

    def readArffFile(self, filename):
        """Parse a sparse ARFF file.

        Blank lines and ``%`` comment lines are skipped. For every attribute
        named ``<name>-<index>`` the vocabulary of the FeatureVector helper is
        updated with ``(name, index)``; the index is passed on as a string.

        Args:
            filename: Path of the ARFF file to read.

        Returns:
            ``[featureNameList, featureVector]``. ``featureNameList`` holds the
            attribute names in file order with any ``-<index>`` suffix
            removed. ``featureVector`` is a list with one dict per data row
            mapping the integer attribute index to the lower-cased value
            string.
        """
        featureVector = []  # list of maps, one per data row
        featureNameList = []
        with open(filename, "r") as f:
            for line in f:
                stripped = line.strip()
                # Blank lines used to reach the attribute branch and raise
                # IndexError; '%' starts a comment line in ARFF.
                if not stripped or stripped.startswith('%'):
                    continue

                if not stripped.startswith('@'):
                    # Sparse data row: "{index value,index value,...}"
                    featureMap = {}
                    for attr in stripped.strip("{} ").split(','):
                        attr = attr.strip()
                        if not attr:
                            # Empty row "{}" or a stray comma.
                            continue
                        (key, value) = attr.split(' ', 1)
                        featureMap[int(key)] = value.strip().lower()
                    featureVector.append(featureMap)
                    continue

                # Header section: only attribute declarations carry names.
                if stripped.lower().startswith(('@data', '@relation')):
                    continue
                attrName = stripped.split()[1]

                if '-' in attrName:
                    # Split on the LAST hyphen so hyphenated feature names
                    # such as "well-known-3" keep their full name.
                    (name, idx) = attrName.rsplit('-', 1)
                    featureNameList.append(name)
                    self.__featureVector.updateVocabularyMap(name, idx)
                    logging.debug("Updated %s with index %s", name, idx)
                else:
                    featureNameList.append(attrName)

        logging.info("Read arff file. Extracted feature vector")
        return [featureNameList, featureVector]

    def generateFeatureVector(self, data, categoryName):
        """Delegate to ``FeatureVector.generateFeatureVector`` and return its result."""
        return self.__featureVector.generateFeatureVector(data, categoryName)