Example no. 1
0
 def _loadData(self):
     """ Load data, returning a dict of text data objects.
 Keys are line numbers at which the text appears in the CSV file.
 """
     return readCSV(
         self.dataPath,
         numLabels=0)  # 0 to train models in unsupervised fashion
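For reference, readCSV() itself is not shown in these examples. Judging from how its result is used below (e.g. in split() and readData()), it appears to return a dict keyed by each record's line number in the CSV, with each value carrying the sample text, its category labels, and a unique ID. The sketch below only illustrates that assumed shape; it is not the actual return value.

# Illustrative only: assumed shape of the dict returned by readCSV().
# Keys are CSV line numbers; values hold (text, [labels], uniqueID).
exampleDataDict = {
    3: ("fox eats carrots", ["animals"], 101),
    4: ("carrots are healthy", ["food"], 102),
}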
def generateDataFile(inputData, outputDataDir, type):
    """
    Generates a samples data file with the words in each sample scrambled or
    reversed, depending on the type argument.

    @param inputData      (str)  Path to the original samples CSV file.
    @param outputDataDir  (str)  Directory in which the generated file is written.
    @param type           (str)  Either "scrambled" or "reversed".

    """
    if not os.path.exists(outputDataDir):
        os.makedirs(outputDataDir)

    fileName = string.join(inputData.split(".")[:-1], ".") + "_" + type + ".csv"
    dataDict = readCSV(inputData, numLabels=3)
    headers = ["QID", "QuestionText", "Response", "Classification1",
               "Classification2", "Classification3"]
    data = []
    for sample in dataDict.items():
      response = sample[1][0]
      tokens = response.split(" ")
      tokens = cleanTokens(tokens)

      response = None
      if type == "scrambled":
        random.shuffle(tokens)
        response = " ".join(tokens)
      elif type == "reversed":
        response = " ".join(tokens[::-1])

      dataToWrite = [sample[0], "", response]
      dataToWrite.extend(sample[1][1])
      data.append(dataToWrite)

    writeCSV(data, headers, os.path.join(outputDataDir, fileName))
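A minimal usage sketch for generateDataFile() above; the file and directory names are placeholders rather than paths used elsewhere in these examples.

# Hypothetical invocation: reads samples.csv, reverses the token order of each
# response, and writes generated/samples_reversed.csv.
generateDataFile("samples.csv", "generated", "reversed")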
    def split(self,
              filePath=None,
              numLabels=3,
              textPreprocess=False,
              dataDict=None,
              abbrCSV="",
              contrCSV="",
              ignoreCommon=100,
              removeStrings="[identifier deleted]",
              correctSpell=True):
        """
    Split all the comments in a file into tokens, w/ or w/o preprocessing.
    If both filePath and dataDict are given, filePath takes precedence.

    @param filePath        (str)    Path to csv file
    @param dataDict        (dict)   Data as returned by readCSV()
    @param numLabels       (int)    Number of columns of category labels.
    @param textPreprocess  (bool)   True will preprocess text while tokenizing.
    
    @return dataDict       (dict)   Data as read in from filePath.

    Please see TextPreprocess tokenize() for the other parameters; they're only
    used when textPreprocess is True.
    """
        if filePath:
            dataDict = readCSV(filePath, numLabels=numLabels)

        if dataDict is None:
            raise Exception("No data given, or could not read CSV.")

        preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
        expandAbbr = (abbrCSV != "")
        expandContr = (contrCSV != "")

        for recordNum, record in dataDict.iteritems():
            comment, categories, uniqueID = record

            # Convert the categories to a string of their IDs
            categories = string.join(
                [str(self.categoryToId[c]) for c in categories])

            if textPreprocess:
                tokens, _ = preprocessor.tokenizeAndFilter(
                    comment, ignoreCommon, removeStrings, correctSpell,
                    expandAbbr, expandContr)
            else:
                tokens = preprocessor.tokenize(comment)

            data = self._formatSequence(tokens, categories, recordNum,
                                        uniqueID)

            self.records.append(data)
            self.sequenceCount += 1

        return dataDict
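A hedged usage sketch for the split() method above; dataGen stands in for an instance of whatever class defines split(), and the CSV paths are placeholders.

# Hypothetical call: tokenize a labeled CSV with text preprocessing enabled.
dataDict = dataGen.split(filePath="labeled_samples.csv",
                         numLabels=3,
                         textPreprocess=True,
                         abbrCSV="abbreviations.csv",
                         contrCSV="contractions.csv")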
    def split(
        self,
        filePath=None,
        numLabels=3,
        textPreprocess=False,
        dataDict=None,
        abbrCSV="",
        contrCSV="",
        ignoreCommon=100,
        removeStrings="[identifier deleted]",
        correctSpell=True,
    ):
        """
    Split all the comments in a file into tokens, w/ or w/o preprocessing.
    If both filePath and dataDict are given, filePath takes precedence.

    @param filePath        (str)    Path to csv file
    @param dataDict        (dict)   Data as returned by readCSV()
    @param numLabels       (int)    Number of columns of category labels.
    @param textPreprocess  (bool)   True will preprocess text while tokenizing.
    
    @return dataDict       (dict)   Data as read in from filePath.

    Please see TextPreprocess tokenize() for the other parameters; they're only
    used when textPreprocess is True.
    """
        if filePath:
            dataDict = readCSV(filePath, numLabels=numLabels)

        if dataDict is None:
            raise Exception("No data given, or could not read CSV.")

        preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
        expandAbbr = abbrCSV != ""
        expandContr = contrCSV != ""

        for recordNum, record in dataDict.iteritems():
            comment, categories, uniqueID = record

            # Convert the categories to a string of their IDs
            categories = string.join([str(self.categoryToId[c]) for c in categories])

            if textPreprocess:
                tokens, _ = preprocessor.tokenizeAndFilter(
                    comment, ignoreCommon, removeStrings, correctSpell, expandAbbr, expandContr
                )
            else:
                tokens = preprocessor.tokenize(comment)

            data = self._formatSequence(tokens, categories, recordNum, uniqueID)

            self.records.append(data)
            self.sequenceCount += 1

        return dataDict
Example no. 5
0
def run(args):

    if args.loadPath:
        model = loadModel(args.loadPath)

    elif args.modelName == "HTMNetwork":
        networkConfig = loadJSON(_NETWORK_JSON)

        print "Creating the network model..."
        model = _createModel(modelName=args.modelName,
                             savePath=args.savePath,
                             networkConfig=networkConfig,
                             inputFilePath=args.dataPath,
                             prepData=True,
                             numLabels=0,
                             stripCats=True,
                             retinaScaling=1.0)

        numRecords = sum(
            model.networkDataGen.getNumberOfTokens(model.networkDataPath))

        print "Training the model..."
        model.trainModel(
            iterations=numRecords)  # TODO: switch to using trainNetwork

    else:
        model = _createModel(modelName=args.modelName, savePath=args.savePath)

        dataDict = readCSV(args.dataPath, numLabels=0)

        print "Preparing and encoding the data..."
        samples = model.prepData(dataDict, args.preprocess)
        patterns = model.encodeSamples(samples)

        print "Training the model..."
        for i in xrange(len(samples)):
            model.trainModel(i)

    if args.savePath:
        model.saveModel()

    # Query the model.
    printTemplate = "{0:<10}|{1:<30}"
    while True:
        print "Now we query the model for samples (quit with 'q')..."
        input = raw_input("Enter a query: ")
        if input == "q": break
        sortedDistances = model.queryModel(input, args.preprocess)
        print printTemplate.format("Sample ID", "Distance from query")
        for sID, dist in sortedDistances:
            print printTemplate.format(sID, dist)
    return
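For context, here is a hypothetical argparse setup exposing the attributes run() reads (dataPath, modelName, loadPath, savePath, preprocess); the real script's parser and default values may differ.

# Hypothetical driver; attribute names match what run() accesses above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--dataPath", default="data.csv")
parser.add_argument("--modelName", default="HTMNetwork")
parser.add_argument("--loadPath", default="")
parser.add_argument("--savePath", default="")
parser.add_argument("--preprocess", action="store_true")

if __name__ == "__main__":
    run(parser.parse_args())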
Example no. 6
0
def readData(args):
    """
  Read data file and print out some statistics
  Return a training set, test set, labelId to text map, and docId to categories
  map.

  Return format:
      trainingData = [
        ["fox eats carrots", [0], docId],
        ["fox eats peppers", [0], docId],
        ["carrots are healthy", [1], docId],
        ["peppers is healthy", [1], docId],
      ]
  """
    # Read data
    dataDict = readCSV(args.dataPath, 1)
    labelRefs, dataDict = mapLabelRefs(dataDict)
    categoriesInOrderOfInterest = [8, 9, 10, 5, 6, 11, 13][0 : args.numLabels]

    # Select data based on categories of interest. Shift category indices down
    # so we go from 0 to numLabels-1
    trainingData = []
    counts = numpy.zeros(len(labelRefs))
    for document in dataDict.itervalues():
        docId = document[2]
        oldCategoryIndex = document[1][0]
        if oldCategoryIndex in categoriesInOrderOfInterest:
            newIndex = categoriesInOrderOfInterest.index(oldCategoryIndex)
            trainingData.append([document[0], [newIndex], docId])
            counts[newIndex] += 1

    # For each document, figure out which categories it belongs to
    # Include the shifted category index
    documentCategoryMap = {}
    for doc in dataDict.iteritems():
        docId = doc[1][2]
        oldCategoryIndex = doc[1][1][0]
        if oldCategoryIndex in categoriesInOrderOfInterest:
            newIndex = categoriesInOrderOfInterest.index(oldCategoryIndex)
            v = documentCategoryMap.get(docId, [])
            v.append(newIndex)
            documentCategoryMap[docId] = v

    labelRefs = [labelRefs[i] for i in categoriesInOrderOfInterest]
    print "Total number of unique documents", len(documentCategoryMap)
    print "Category counts: ", counts
    print "Categories in training/test data:", labelRefs

    return trainingData, trainingData, labelRefs, documentCategoryMap
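The docstring above illustrates trainingData but not documentCategoryMap; an assumed sketch of its shape, based on how it is built in the loop above:

# Illustrative only: each docId maps to the shifted category indices of all
# records belonging to that document.
exampleDocumentCategoryMap = {
    101: [0, 2],
    102: [1],
}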
Example no. 7
0
  def __init__(self, dataPath="data.csv"):
    """
    Initializes the imbu model with the given sample data.

    :param str dataPath: Path to sample data file.
                         Must be a CSV file having 'ID' and 'Sample' columns
    """
    g_log.info("Initialize imbu model")

    csvdata = readCSV(dataPath, numLabels=0)
    self.samples = OrderedDict()
    for dataID, text in csvdata.iteritems():
      self.samples[dataID] = text


    self.models = {modelName: createModel(modelName, dataPath, csvdata)
      for modelName, modelFactory in _MODEL_MAPPING.iteritems()}
Example no. 8
0
def run(args):

  if args.loadPath:
    model = loadModel(args.loadPath)
  
  elif args.modelName == "HTMNetwork":
    networkConfig = loadJSON(_NETWORK_JSON)
    
    print "Creating the network model..."
    model = _createModel(modelName=args.modelName, savePath=args.savePath,
      networkConfig=networkConfig, inputFilePath=args.dataPath, prepData=True,
      numLabels=0, stripCats=True, retinaScaling=1.0)

    numRecords = sum(
      model.networkDataGen.getNumberOfTokens(model.networkDataPath))

    print "Training the model..."
    model.trainModel(iterations=numRecords)  # TODO: switch to using trainNetwork

  else:
    model = _createModel(modelName=args.modelName, savePath=args.savePath)

    dataDict = readCSV(args.dataPath, numLabels=0)

    print "Preparing and encoding the data..."
    samples = model.prepData(dataDict, args.preprocess)
    patterns = model.encodeSamples(samples)

    print "Training the model..."
    for i in xrange(len(samples)):
      model.trainModel(i)

  if args.savePath:
    model.saveModel()

  # Query the model.
  printTemplate = "{0:<10}|{1:<30}"
  while True:
    print "Now we query the model for samples (quit with 'q')..."
    input = raw_input("Enter a query: ")
    if input == "q": break
    sortedDistances = model.queryModel(input, args.preprocess)
    print printTemplate.format("Sample ID", "Distance from query")
    for sID, dist in sortedDistances:
      print printTemplate.format(sID, dist)
  return
Example no. 9
0
    def __init__(self, dataPath="data.csv"):
        """
    Initializes the imbu model with the given sample data.

    :param str dataPath: Path to sample data file.
                         Must be a CSV file having 'ID' and 'Sample' columns
    """
        g_log.info("Initialize imbu model")

        csvdata = readCSV(dataPath, numLabels=0)
        self.samples = OrderedDict()
        for dataID, text in csvdata.iteritems():
            self.samples[dataID] = text

        self.models = {
            modelName: createModel(modelName, dataPath, csvdata)
            for modelName, modelFactory in _MODEL_MAPPING.iteritems()
        }
Example no. 10
0
  def validateExperiment(self, expectationFilePath):
    """Returns accuracy of predicted labels against expected labels."""
    dataDict = readCSV(expectationFilePath, numLabels=self.numClasses)

    accuracies = numpy.zeros((len(self.results)))
    for i, trial in enumerate(self.results):
      for j, predictionList in enumerate(trial[0]):
        predictions = [self.labelRefs[p] for p in predictionList]
        if predictions == []:
          predictions = ["(none)"]
        expected = dataDict.items()[j+self.trainSizes[i]][1]

        accuracies[i] += (float(len(set(predictions) & set(expected[1])))
                          / len(expected[1]))

      accuracies[i] = accuracies[i] / len(trial[0])

    return accuracies
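To make the scoring in validateExperiment() concrete, here is a small worked example of the per-sample overlap score it accumulates (the labels are made up):

# One sample's contribution before averaging: |predicted & expected| / |expected|
predictions = ["billing", "support"]
expected = ["billing", "shipping", "support"]
score = float(len(set(predictions) & set(expected))) / len(expected)  # 2/3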
  def getExpectedClassifications(runner, expectationFilePath):
    """
    Return a list of the labels predicted by runner and a list of expected
    labels from the expected classifications file path.
    """
    dataDict = readCSV(expectationFilePath, numLabels=3)

    expectedClasses = []
    resultClasses = []
    for trial, trialResults in enumerate(runner.results):
      for i, predictionList in enumerate(trialResults[0]):
        predictions = [runner.labelRefs[p] for p in predictionList]
        if predictions == []:
          predictions = ["(none)"]
        resultClasses.append(predictions)
        expectedClasses.append(dataDict.items()[i+runner.trainSizes[trial]][1][1])

    return expectedClasses, resultClasses
Example no. 12
0
    def validateExperiment(self, expectationFilePath):
        """Returns accuracy of predicted labels against expected labels."""
        dataDict = readCSV(expectationFilePath, numLabels=self.numClasses)

        accuracies = numpy.zeros((len(self.results)))
        for i, trial in enumerate(self.results):
            for j, predictionList in enumerate(trial[0]):
                predictions = [self.labelRefs[p] for p in predictionList]
                if predictions == []:
                    predictions = ["(none)"]
                expected = dataDict.items()[j + self.trainSizes[i]][1]

                accuracies[i] += (
                    float(len(set(predictions) & set(expected[1]))) /
                    len(expected[1]))

            accuracies[i] = accuracies[i] / len(trial[0])

        return accuracies
Example no. 13
0
    def getExpectedClassifications(runner, expectationFilePath):
        """
    Return a list of the labels predicted by runner and a list of expected
    labels from the expected classifications file path.
    """
        dataDict = readCSV(expectationFilePath, numLabels=3)

        expectedClasses = []
        resultClasses = []
        for trial, trialResults in enumerate(runner.results):
            for i, predictionList in enumerate(trialResults[0]):
                predictions = [runner.labelRefs[p] for p in predictionList]
                if predictions == []:
                    predictions = ["(none)"]
                resultClasses.append(predictions)
                expectedClasses.append(
                    dataDict.items()[i + runner.trainSizes[trial]][1][1])

        return expectedClasses, resultClasses
Example no. 14
0
  def setupData(self, preprocess=False):
    """
    Get the data from CSV and preprocess if specified. The call to readCSV()
    assumes a specific CSV format, detailed in its docstring.

    @param preprocess   (bool)    Whether or not to preprocess the data when
                                  reading in samples.
    """
    self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

    if self.experimentType == "incremental":
      # stop now if the data won't work for the specified experiment
      if (not isinstance(self.trainSizes, list) or not
          all([0 <= size <= len(self.dataDict) for size in self.trainSizes])):
        raise ValueError("Invalid size(s) for training set(s).")

    self.labelRefs, self.dataDict = mapLabelRefs(self.dataDict)

    self.samples = self.model.prepData(self.dataDict, preprocess)

    if self.verbosity > 1:
      for i, s in self.samples.iteritems():
        print i, s
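A hedged illustration of the incremental-experiment check in setupData() above: every entry of trainSizes must be a size between 0 and the number of records in dataDict.

# Hypothetical values; suppose self.dataDict holds 100 records.
numSamples = 100
validTrainSizes = [10, 25, 50, 100]  # passes the check
invalidTrainSizes = [10, 150]        # would raise ValueError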
Example no. 15
0
def generateDataFile(inputData, outputDataDir, type):
    """
    Generates a samples data file with the words in each sample scrambled or
    reversed, depending on the type argument.

    @param inputData      (str)  Path to the original samples CSV file.
    @param outputDataDir  (str)  Directory in which the generated file is written.
    @param type           (str)  Either "scrambled" or "reversed".

    """
    if not os.path.exists(outputDataDir):
        os.makedirs(outputDataDir)

    fileName = string.join(inputData.split(".")[:-1],
                           ".") + "_" + type + ".csv"
    dataDict = readCSV(inputData, numLabels=3)
    headers = [
        "QID", "QuestionText", "Response", "Classification1",
        "Classification2", "Classification3"
    ]
    data = []
    for sample in dataDict.items():
        response = sample[1][0]
        tokens = response.split(" ")
        tokens = cleanTokens(tokens)

        response = None
        if type == "scrambled":
            random.shuffle(tokens)
            response = " ".join(tokens)
        elif type == "reversed":
            response = " ".join(tokens[::-1])

        dataToWrite = [sample[0], "", response]
        dataToWrite.extend(sample[1][1])
        data.append(dataToWrite)

    writeCSV(data, headers, os.path.join(outputDataDir, fileName))
Example no. 16
0
    def setupData(self, preprocess=False):
        """
    Get the data from CSV and preprocess if specified. The call to readCSV()
    assumes a specific CSV format, detailed in its docstring.

    @param preprocess   (bool)    Whether or not to preprocess the data when
                                  reading in samples.
    """
        self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

        if self.experimentType == "incremental":
            # stop now if the data won't work for the specified experiment
            if (not isinstance(self.trainSizes, list) or not all(
                [0 <= size <= len(self.dataDict)
                 for size in self.trainSizes])):
                raise ValueError("Invalid size(s) for training set(s).")

        self._mapLabelRefs()

        self.samples = self.model.prepData(self.dataDict, preprocess)

        if self.verbosity > 1:
            for i, s in self.samples.iteritems():
                print i, s
Example no. 17
0
  try:
    with pkg_resources.resource_filename(__name__, jsonPath) as f:
      return json.load(f)
  except IOError as e:
    print "Could not find JSON at '{}'.".format(jsonPath)
    raise e


# Indicates global ready status of all models.  g_ready will transition to
# True when all models have been created, trained, and are ready to handle
# requests
g_ready = False
g_models = {}
g_csvdata = (
  readCSV(
    os.getenv("IMBU_DATA",
              pkg_resources.resource_filename(__name__, "data.csv")),
  numLabels=0)
)

# Get data and order by unique ID
g_samples = OrderedDict(
  (sample[2], sample[0]) for sample in g_csvdata.values()
)



def createModel(modelName, modelFactory):
  """Return an instantiated model."""

  global g_models
Example no. 18
0
 def _loadData(self):
   """ Load data, returning a dict of text data objects.
   Keys are line numbers at which the text appears in the CSV file.
   """
   return readCSV(self.dataPath,
                  numLabels=0) # 0 to train models in unsupervised fashion
Example no. 19
0
 def _loadData(self):
   """ Load data.
   """
   return readCSV(self.dataPath,
                  numLabels=0) # 0 to train models in unsupervised fashion
Example no. 20
0
        with pkg_resources.resource_filename(__name__, jsonPath) as f:
            return json.load(f)
    except IOError as e:
        print "Could not find JSON at '{}'.".format(jsonPath)
        raise e


# Indicates global ready status of all models.  g_ready will transition to
# True when all models have been created, trained, and are ready to handle
# requests
g_ready = False

g_models = {}

# Get data and order by unique ID
g_csvdata = (readCSV(_DATA_PATH, numLabels=0))
g_samples = OrderedDict(
    (int(sample[2]), sample[0]) for sample in g_csvdata.values())


def createModel(modelName, modelFactory):
    """Return an instantiated model."""

    global g_models

    modelDir = os.path.join(_MODEL_CACHE_DIR_PREFIX, modelName)

    try:
        print "Attempting to load from", modelDir
        model = ClassificationModel.loadModel(modelDir)
        modelProxy = SynchronousBackgroundModelProxy(model)
Example no. 21
0
 def _loadData(self):
     """ Load data.
 """
     return readCSV(
         self.dataPath,
         numLabels=0)  # 0 to train models in unsupervised fashion
Example no. 22
0
      return json.load(f)
  except IOError as e:
    print "Could not find JSON at '{}'.".format(jsonPath)
    raise e


# Indicates global ready status of all models.  g_ready will transition to
# True when all models have been created, trained, and are ready to handle
# requests
g_ready = False

g_models = {}

# Get data and order by unique ID
g_csvdata = (
  readCSV(_DATA_PATH, numLabels=0)
)
g_samples = OrderedDict(
  (int(sample[2]), sample[0]) for sample in g_csvdata.values()
)



def createModel(modelName, modelFactory):
  """Return an instantiated model."""

  global g_models

  modelDir = os.path.join(_MODEL_CACHE_DIR_PREFIX, modelName)

  try:
Example no. 23
0
  try:
    with pkg_resources.resource_filename(__name__, jsonPath) as fin:
      return json.load(fin)
  except IOError as e:
    print "Could not find JSON at '{}'.".format(jsonPath)
    raise e


# Indicates global ready status of all models.  g_ready will transition to
# True when all models have been created, trained, and are ready to handle
# requests
g_ready = False
g_models = {}
g_csvdata = (
  readCSV(
    os.getenv("IMBU_DATA",
              pkg_resources.resource_filename(__name__, "data.csv")),
  numLabels=0)
)

# Get data and order by unique ID
g_samples = OrderedDict(
  (sample[2], sample[0]) for sample in g_csvdata.values()
)



def createModel(modelName, modelFactory):
  """Return an instantiated model."""

  global g_models