Example 1
def convertTitlesToPaddedSequences(dataDf):
    """Fit a title tokenizer on the summary vocabulary and encode the
    dataframe's title column as fixed-length integer sequences.

    The tokenizer is fitted on every non-empty line of the titles summary
    file plus the start token, then each title in ``dataDf`` is prefixed
    with the start token, tokenized, and padded/truncated to
    ``MAX_TITLE_LEN``.

    Args:
        dataDf: DataFrame holding a ``TITLE_LABEL`` column.

    Returns:
        2-D array of shape (len(dataDf), MAX_TITLE_LEN) of token ids.
    """
    # Build the vocabulary from the summary file; START_TOKEN must be
    # in-vocabulary since every sequence begins with it.
    vocabularyTexts = getNonEmptyLines(titlesSummaryFilePath)
    vocabularyTexts.append(START_TOKEN)

    tokenizer = preproc.text.Tokenizer(num_words=TITLE_VOCAB_SIZE)
    tokenizer.fit_on_texts(vocabularyTexts)

    # Prefix each title with the start token before encoding.
    prefixedTitles = START_TOKEN + " " + dataDf[TITLE_LABEL].astype(str)
    encodedTitles = tokenizer.texts_to_sequences(prefixedTitles)

    return preproc.sequence.pad_sequences(
        encodedTitles,
        maxlen=MAX_TITLE_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE)
Example 2
def convertLocationsToPaddedSequences(dataDf):
    """Encode the location column of *dataDf* as padded token-id sequences.

    A tokenizer limited to ``LOCATION_VOCAB_SIZE`` words is fitted on the
    non-empty lines of the locations summary file (with the start token
    appended so it is always in-vocabulary). Each location string is then
    prefixed with the start token, converted to token ids, and
    padded/truncated to ``MAX_LOCATION_LEN``.

    Args:
        dataDf: DataFrame holding a ``LOCATION_LABEL`` column.

    Returns:
        2-D array of shape (len(dataDf), MAX_LOCATION_LEN).
    """
    fittingCorpus = getNonEmptyLines(locationsSummaryFilePath)
    fittingCorpus.append(START_TOKEN)

    locTokenizer = preproc.text.Tokenizer(num_words=LOCATION_VOCAB_SIZE)
    locTokenizer.fit_on_texts(fittingCorpus)

    # START_TOKEN + " " is broadcast-prepended to every row of the column.
    withStartToken = START_TOKEN + " " + dataDf[LOCATION_LABEL].astype(str)
    idSequences = locTokenizer.texts_to_sequences(withStartToken)

    padded = preproc.sequence.pad_sequences(
        idSequences,
        maxlen=MAX_LOCATION_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE)
    return padded
Example 3
def convertBenefitsToPaddedSequences(dataDf):
    """Turn the benefits column into padded integer token sequences.

    Fits a ``BENEFITS_VOCAB_SIZE``-word tokenizer on the benefits summary
    file (plus the start token), prefixes every row of the
    ``BENEFITS_LABEL`` column with the start token, tokenizes, and pads or
    truncates each sequence to ``MAX_BENEFITS_LEN``.

    Args:
        dataDf: DataFrame holding a ``BENEFITS_LABEL`` column.

    Returns:
        2-D array of shape (len(dataDf), MAX_BENEFITS_LEN).
    """
    # Vocabulary source: summary file lines + the start token itself.
    corpus = getNonEmptyLines(benefitsSummaryFilePath)
    corpus.append(START_TOKEN)
    tok = preproc.text.Tokenizer(num_words=BENEFITS_VOCAB_SIZE)
    tok.fit_on_texts(corpus)

    sequences = tok.texts_to_sequences(
        START_TOKEN + " " + dataDf[BENEFITS_LABEL].astype(str))

    return preproc.sequence.pad_sequences(
        sequences,
        maxlen=MAX_BENEFITS_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE)
Example 4
def convertDepartmentsToPaddedSequences(dataDf):
    """Encode the department column as fixed-length token-id sequences.

    Builds a tokenizer (capped at ``DEPARTMENT_VOCAB_SIZE`` words) from the
    departments summary file with the start token appended, then encodes
    each start-token-prefixed department string and pads/truncates it to
    ``MAX_DEPARTMENT_LEN``.

    Args:
        dataDf: DataFrame holding a ``DEPARTMENT_LABEL`` column.

    Returns:
        2-D array of shape (len(dataDf), MAX_DEPARTMENT_LEN).
    """
    fitTexts = getNonEmptyLines(departmentsSummaryFilePath)
    fitTexts.append(START_TOKEN)

    deptTokenizer = preproc.text.Tokenizer(num_words=DEPARTMENT_VOCAB_SIZE)
    deptTokenizer.fit_on_texts(fitTexts)

    # Prepend the start token to every department entry, then encode.
    prefixed = START_TOKEN + " " + dataDf[DEPARTMENT_LABEL].astype(str)
    encoded = deptTokenizer.texts_to_sequences(prefixed)

    padded = preproc.sequence.pad_sequences(
        encoded,
        maxlen=MAX_DEPARTMENT_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE)
    return padded