Ejemplo n.º 1
0
def load_bigrams():
    """Build bigram-based training and test data from 'Dataset2.txt'.

    The file is read with read_data, cleaned with preprocess_text and cut
    in half: the first half becomes the test split and the remainder
    (one item longer when the count is odd) becomes the training split.

    Returns:
        A tuple (trX, teX, trY, teY) assembled from the two
        find_cumFrequency results.
    """
    tokens, vocab = read_data('Dataset2.txt')
    tokens = preprocess_text(tokens)

    # Floor division gives an int midpoint on both Python 2 and Python 3;
    # math.floor() returns a float on Python 2, which cannot index a slice.
    midpoint = len(tokens) // 2
    tr_tokens = tokens[midpoint:]
    te_tokens = tokens[:midpoint]

    # get the bigrams by sentence and the corresponding vocabulary
    # NOTE(review): only the vocabulary returned for the test split is kept
    # and it is used for BOTH splits below (the original code overwrote the
    # training one) -- confirm this is intended.
    tr_bigrms, _ = find_bigrams(tr_tokens)
    te_bigrms, vocab2 = find_bigrams(te_tokens)

    # find frequency of bigrams and add to the data
    trY, trX = find_cumFrequency(tr_tokens, tr_bigrms, vocab, vocab2)
    teY, teX = find_cumFrequency(te_tokens, te_bigrms, vocab, vocab2)

    return trX, teX, trY, teY
Ejemplo n.º 2
0
def load_unigrams():
    """Build unigram-frequency training and test data from 'Dataset2.txt'.

    The file is read with read_data, cleaned with preprocess_text and cut
    in half: the first half becomes the test split and the remainder
    (one item longer when the count is odd) becomes the training split.

    Returns:
        A tuple (trX, teX, trY, teY) assembled from the two find_frequency
        results.
    """
    # tokenize data by sentences
    tokens, vocab = read_data('Dataset2.txt')

    # clean up data
    tokens = preprocess_text(tokens)

    # split into training and test sets
    # Floor division gives an int midpoint on both Python 2 and Python 3;
    # math.floor() returns a float on Python 2, which cannot index a slice.
    midpoint = len(tokens) // 2
    tr_tokens = tokens[midpoint:]
    te_tokens = tokens[:midpoint]

    # find frequency: positive: 0; negative: 1
    trY, trX = find_frequency(tr_tokens, vocab)
    teY, teX = find_frequency(te_tokens, vocab)

    return trX, teX, trY, teY
Ejemplo n.º 3
0
def main():
	"""Read the course data, format it, and write the formatted result.

	Loads 'course_data.json' through data_utilities.read_data, runs the
	loaded value through format_data, and passes the outcome to
	data_utilities.write_data under 'parse_formatted_course_data.json'.
	"""
	source_name = 'course_data.json'
	target_name = 'parse_formatted_course_data.json'

	raw_courses = data_utilities.read_data(source_name)
	formatted_courses = format_data(raw_courses)
	data_utilities.write_data(formatted_courses, target_name)
Ejemplo n.º 4
0
		titles.append(class_datum.get('full_title'))
	completer = MyCompleter(titles)
	top = completer.complete(query, range(5))
	print top

class MyCompleter(object):
	"""Prefix completer over a sorted collection of option strings.

	The most recent list of prefix matches is cached on the instance in
	``matches`` and reused whenever ``complete`` is called with empty text.
	"""

	def __init__(self, options):
		"""Store ``options`` in sorted order and start with no cached matches."""
		self.options = sorted(options)
		# Start with an empty cache so complete() can be called with empty
		# text before any search has run (this used to raise AttributeError).
		self.matches = []

	def complete(self, text, states):
		"""Return the cached matches found at the indexes in ``states``.

		Args:
			text: Prefix to search for. When non-empty, the cache is rebuilt
				from the truthy options that start with it; when empty, the
				previously cached matches are reused.
			states: Iterable of integer indexes into the match list.

		Returns:
			A list with one match per entry of ``states``, or None if any
			index falls outside the match list (the IndexError is printed,
			and matches collected before the failing index are discarded).

		The number of cached matches is printed on every call.
		"""
		if text:  # cache matches (entries that start with entered text)
			self.matches = [s for s in self.options
								if s and s.startswith(text)]

		# Single-argument print(...) produces the same output on Python 2
		# and is also valid syntax on Python 3.
		print(len(self.matches))

		# return matches indexed by state
		try:
			return [self.matches[state] for state in states]
		except IndexError as e:
			print(e)
			return None


# Script entry point: load the class records, prompt for a query and run
# the search over them.
if __name__ == '__main__':
	# Take the 'results' entry of whatever read_data returns for the file.
	class_data = data_utilities.read_data('all_class_data.json')['results']
	# raw_input is the Python 2 builtin that reads one line from stdin.
	query = raw_input("Class query: ")
	parse_query_search(class_data, query)