  def split(self, filePath=None, numLabels=3, textPreprocess=False, dataDict=None,
            abbrCSV="", contrCSV="", ignoreCommon=100,
            removeStrings=("[identifier deleted]",), correctSpell=True):
    """
    Split all the comments in a file into tokens, w/ or w/o preprocessing.
    Specifying both filePath and dataDict will prefer filePath.

    @param filePath        (str)    Path to csv file
    @param dataDict        (dict)   Data as returned by readCSV()
    @param numLabels       (int)    Number of columns of category labels.
    @param textPreprocess  (bool)   True will preprocess text while tokenizing.
    
    @return dataDict       (dict)   Data as read in from filePath.

    Please see TextPreprocess tokenize() for the other parameters; they're only
    used when textPreprocess is True.
    """
    if filePath:
      dataDict = readCSV(filePath, numLabels=numLabels)

    if dataDict is None:
      raise ValueError("No data given, or could not read CSV.")

    preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
    expandAbbr = (abbrCSV != "")
    expandContr = (contrCSV != "")

    for recordNum, record in dataDict.items():
      comment, categories, uniqueID = record

      # Convert the categories to a string of their IDs
      categories = " ".join(str(self.categoryToId[c]) for c in categories)

      if textPreprocess:
        tokens = preprocessor.tokenize(comment,
                                       ignoreCommon=ignoreCommon,
                                       removeStrings=removeStrings,
                                       correctSpell=correctSpell,
                                       expandAbbr=expandAbbr,
                                       expandContr=expandContr)
      else:
        tokens = preprocessor.tokenize(comment)

      data = self._formatSequence(tokens, categories, recordNum, uniqueID)

      self.records.append(data)
      self.sequenceCount += 1
    
    return dataDict
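
# Usage sketch (illustration only, not from the source repo). It assumes split()
# is a method of a data-generator class, here called "NetworkDataGenerator" purely
# for illustration, and that readCSV() is importable alongside it; the CSV file
# names are hypothetical.
generator = NetworkDataGenerator()

# filePath takes precedence: the CSV is read here, then tokenized with
# preprocessing driven by the abbreviation and contraction files.
trainingData = generator.split(filePath="comments.csv",
                               numLabels=3,
                               textPreprocess=True,
                               abbrCSV="abbreviations.csv",
                               contrCSV="contractions.csv")

# Alternatively, pass in data that was already read with readCSV().
dataDict = readCSV("comments.csv", numLabels=3)
generator.split(dataDict=dataDict, textPreprocess=False)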
  def testTokenizeExpandContraction(self):
    """Tests contractions are expanded."""
    text = "I can't work at [identifier deleted] if you don't allw me to wfh"
    processor = TextPreprocess()

    expected_tokens = ["i", "can", "not", "work", "at", "identifier", "deleted",
                       "if", "you", "do", "not", "allw", "me", "to", "wfh"]
    tokens = processor.tokenize(text, expandContr=True)
    self.assertSequenceEqual(tokens, expected_tokens)
  def testTokenizeRemoveString(self):
    """Tests a provided string is ignored."""
    text = "I can't work at [identifier deleted] if you don't allw me to wfh"
    processor = TextPreprocess()

    expected_tokens = ["i", "can", "t", "work", "at", "if", "you", "don",
                       "t", "allw", "me", "to", "wfh"]
    tokens = processor.tokenize(text, removeStrings=["[identifier deleted]"])
    self.assertSequenceEqual(tokens, expected_tokens)
  def testTokenizeNoPreprocess(self):
    """Tests none of the preprocessing methods are used."""
    text = "I can't work at [identifier deleted] if you don't allw me to wfh"
    processor = TextPreprocess()

    expected_tokens = ["i", "can", "t", "work", "at", "identifier", "deleted",
                       "if", "you", "don", "t", "allw", "me", "to", "wfh"]
    tokens = processor.tokenize(text)
    self.assertSequenceEqual(tokens, expected_tokens)
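
# The three tests above pin down tokenize()'s observable behavior: lowercasing,
# splitting on non-letter characters, optional removal of given substrings, and
# optional contraction expansion. A minimal self-contained sketch that reproduces
# those expectations follows; it is NOT the TextPreprocess implementation (the
# real class drives contraction expansion from a CSV file), just an illustration.
import re

def sketchTokenize(text, removeStrings=None, expandContr=False):
  """Illustrative tokenizer consistent with the expected_tokens in the tests."""
  if removeStrings:
    for s in removeStrings:
      text = text.replace(s, "")
  if expandContr:
    # Tiny hard-coded mapping for illustration only.
    for contraction, expansion in {"can't": "can not", "don't": "do not"}.items():
      text = text.replace(contraction, expansion)
  # Lowercase and keep alphabetic runs, so "can't" tokenizes to ["can", "t"].
  return re.findall(r"[a-z]+", text.lower())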
def trainModelWithText(model, trainingData):
  """ Train the given model on trainingData.
  This is (essentially) the same training method as in the research repo's
  imbu_runner.py.
  """
  textPreprocessor = TextPreprocess()
  for seqId, (text, _, _) in enumerate(trainingData.values()):
    textTokens = textPreprocessor.tokenize(text)  # TODO: use model's tokenization method instead
    lastToken = len(textTokens) - 1
    for i, token in enumerate(textTokens):
      # use the sequence's ID as the category label
      model.trainText(token,
                      [seqId],
                      sequenceId=seqId,
                      reset=int(i==lastToken))
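
# Usage sketch (illustration only). trainingData is expected to map record IDs to
# (text, categories, uniqueID) tuples, e.g. the dict returned by split() above,
# and "model" can be any object exposing trainText(); the stub class and example
# records below are hypothetical.
class _LoggingModel(object):
  """Stand-in for a real model; just records the calls trainText() receives."""

  def __init__(self):
    self.calls = []

  def trainText(self, token, labels, sequenceId=None, reset=0):
    self.calls.append((token, labels, sequenceId, reset))


trainingData = {
    0: ("unable to log in to the portal", ["access"], "rec-0"),
    1: ("the app crashes when uploading a file", ["bugs"], "rec-1"),
}
model = _LoggingModel()
trainModelWithText(model, trainingData)
# model.calls now holds one (token, [seqId], seqId, reset) entry per token, with
# reset == 1 only on the last token of each sequence.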
  def testFunctionsWithoutDataFiles(self):
    """
    Ensures a TextPreprocess object can be created and can tokenize text when
    the data files (corpus text, abbreviations, and contractions) are missing.
    """
    text = "I can't work at [identifier deleted] if you don't allw me to wfh"
    processor = TextPreprocess(corpusTxt="fake.txt",
                               abbrCSV="not_here.csv",
                               contrCSV="not_real.csv")

    tokens = processor.tokenize(text)
    expected_tokens = ["i", "can", "t", "work", "at", "identifier", "deleted",
                       "if", "you", "don", "t", "allw", "me", "to", "wfh"]

    self.assertSequenceEqual(tokens, expected_tokens)
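
# The test above relies on TextPreprocess degrading gracefully when its data files
# cannot be found. A minimal sketch of that loading pattern, as a hypothetical
# helper (not the class's actual code): return an empty mapping instead of raising
# when the file is absent, so tokenize() can still run without expansions.
import csv
import os

def loadReplacementsCSV(path):
  """Return {token: replacement} from a two-column CSV, or {} if unavailable."""
  if not path or not os.path.isfile(path):
    return {}
  with open(path) as f:
    return {row[0]: row[1] for row in csv.reader(f) if len(row) >= 2}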