import math


def load_bigrams():
    # tokenize data by sentences
    tokens, vocab = read_data('Dataset2.txt')
    # clean up data
    tokens = preprocess_text(tokens)
    # split into training and test sets
    tr_tokens = tokens[math.floor(len(tokens) / 2):]
    te_tokens = tokens[:math.floor(len(tokens) / 2)]
    # get the bigrams by sentence and the corresponding vocabulary,
    # keeping the train and test bigram vocabularies separate
    tr_bigrms, tr_vocab2 = find_bigrams(tr_tokens)
    te_bigrms, te_vocab2 = find_bigrams(te_tokens)
    # find frequency of bigrams and add to the data
    trY, trX = find_cumFrequency(tr_tokens, tr_bigrms, vocab, tr_vocab2)
    teY, teX = find_cumFrequency(te_tokens, te_bigrms, vocab, te_vocab2)
    return trX, teX, trY, teY
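# find_bigrams is referenced above but not shown in this section. The sketch
# below is a hypothetical reconstruction inferred only from the call site
# (it is assumed to take tokenized sentences and return per-sentence bigram
# lists plus the set of distinct bigrams); the real helper may differ.
def find_bigrams_sketch(sentences):
    bigrams_by_sentence = []
    bigram_vocab = set()
    for sentence in sentences:
        # adjacent token pairs within one sentence
        bigrams = list(zip(sentence, sentence[1:]))
        bigrams_by_sentence.append(bigrams)
        bigram_vocab.update(bigrams)
    return bigrams_by_sentence, bigram_vocab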
def load_unigrams():
    # tokenize data by sentences
    tokens, vocab = read_data('Dataset2.txt')
    # clean up data
    tokens = preprocess_text(tokens)
    # split into training and test sets
    tr_tokens = tokens[math.floor(len(tokens) / 2):]
    te_tokens = tokens[:math.floor(len(tokens) / 2)]
    # find frequency features and labels (positive: 0; negative: 1)
    trY, trX = find_frequency(tr_tokens, vocab)
    teY, teX = find_frequency(te_tokens, vocab)
    return trX, teX, trY, teY
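# Hypothetical usage sketch for the two loaders above: both share the same
# call shape and return (train features, test features, train labels,
# test labels). Dataset2.txt must be present for this to run.
def demo_loaders():
    trX, teX, trY, teY = load_unigrams()
    print(len(trX), len(trY))  # feature and label counts should align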
def main():
    data = data_utilities.read_data('course_data.json')
    formatted_data = format_data(data)
    data_utilities.write_data(formatted_data, 'parse_formatted_course_data.json')
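# data_utilities is not shown in this section; it is assumed to be a thin
# wrapper around the json module. A minimal equivalent follows (hypothetical
# helper names, chosen to avoid clashing with the read_data text loader above):
import json


def read_json_data(path):
    with open(path) as f:
        return json.load(f)


def write_json_data(data, path):
    with open(path, 'w') as f:
        json.dump(data, f, indent=2)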
def parse_query_search(class_data, query):
    # collect the full title of every class, then complete the query against them
    titles = []
    for class_datum in class_data:
        titles.append(class_datum.get('full_title'))
    completer = MyCompleter(titles)
    top = completer.complete(query, range(5))
    print(top)


class MyCompleter:
    # Custom completer
    def __init__(self, options):
        self.options = sorted(options)

    def complete(self, text, states):
        if text:
            # cache matches (entries that start with entered text)
            self.matches = [s for s in self.options if s and s.startswith(text)]
        else:
            # an empty query matches every option
            self.matches = self.options[:]
        # return the match indexed by each state
        try:
            complete = []
            print(len(self.matches))  # debug: number of matches found
            for state in states:
                complete.append(self.matches[state])
            return complete
        except IndexError as e:
            print(e)
            return None


if __name__ == '__main__':
    class_data = data_utilities.read_data('all_class_data.json')['results']
    query = input("Class query: ")
    parse_query_search(class_data, query)
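# Quick standalone check of MyCompleter with invented sample titles; the
# first printed line is the debug match count from inside complete().
def demo_completer():
    c = MyCompleter(['Intro to AI', 'Intro to Algorithms', 'Linear Algebra'])
    print(c.complete('Intro', range(2)))
    # prints 2, then ['Intro to AI', 'Intro to Algorithms']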