# Punctuation treated as a word separator when tokenizing document text.
_SEPARATORS = "#_()/:."


def _load_stopwords_file(path):
    """Return the stopwords listed in the UTF-8 text file at *path*.

    The file holds one word per line. Blank lines and lines starting
    with "#" are skipped. Each line is stripped of surrounding whitespace
    so that a trailing carriage return or space cannot produce a stopword
    that never matches a token.
    """
    import io  # local import keeps this block self-contained

    words = []
    # io.open decodes while reading and behaves the same on Python 2 and 3.
    with io.open(path, encoding="utf-8") as f:
        for line in f:
            word = line.strip()
            if word and not word.startswith("#"):
                words.append(word)
    return words


def _tokenize(document, stopwords):
    """Split *document* into lowercase tokens, dropping noise words.

    Every character in ``_SEPARATORS`` is replaced by a space before the
    text is split on whitespace. Tokens of three characters or fewer and
    tokens found in *stopwords* are discarded.
    """
    text = document.lower()
    for separator in _SEPARATORS:
        text = text.replace(separator, " ")
    return [
        word for word in text.split()
        if len(word) > 3 and word not in stopwords
    ]


def run():
    """Build the LSI topic model from Feature texts and refresh Topic rows.

    Steps:
      1. Read the ``text`` of every Feature whose text is not None.
      2. Assemble stopwords from lower-cased month names, the NLTK English
         list and the ``stopwords.txt`` file next to this module.
      3. Tokenize each text and drop tokens that occur only once across
         all texts.
      4. Build a Dictionary and bag-of-words corpus; the dictionary is
         saved to ``<this directory>/dictionary``.
      5. Train a 10-topic LsiModel and save it to ``<this directory>/model``.
      6. Replace all Topic rows with one row per topic reported by the model.

    Returns:
        None.

    Any Exception raised along the way is caught and printed rather than
    re-raised, so callers never see a failure from this function.
    """
    try:
        print("starting to build LSI Model")

        documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
        print("number_of_documents: %s" % (len(documents),))

        stopwords = [month.lower() for month in month_to_number]
        stopwords += nltk_stopwords.words('english')
        # Reported before the file-based stopwords are added.
        print("stopwords: %s" % (len(stopwords),))
        stopwords.extend(
            _load_stopwords_file(path_to_directory_of_this_file + "/stopwords.txt")
        )
        stopwords = set(stopwords)

        texts = [_tokenize(document, stopwords) for document in documents]

        # Keep only tokens seen more than once across the whole collection.
        counter = Counter()
        for text in texts:
            counter.update(text)
        texts = [[token for token in text if counter[token] > 1] for text in texts]

        dictionary = Dictionary(texts)
        print("dictionary: %s" % (dictionary,))
        dictionary.save(path_to_directory_of_this_file + "/dictionary")

        corpus = [dictionary.doc2bow(text) for text in texts]
        print("corpus: %s" % (type(corpus),))

        print("generating lsi model")
        lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)

        print("saving LSI model")
        lsi.save(path_to_directory_of_this_file + "/model")

        # Build the replacement rows BEFORE deleting the existing ones, so a
        # failure while formatting topic names does not leave the table empty.
        # NOTE(review): each entry is presumably (topic_id, description) -
        # confirm against the installed gensim version.
        topics = [
            Topic(id=topic[0], name=prettify_topic(topic[1]))
            for topic in lsi.show_topics()
        ]
        Topic.objects.all().delete()
        Topic.objects.bulk_create(topics)
    except Exception as e:
        # Deliberate best-effort: report the failure without propagating it.
        print(e)
from appfd.models import Feature, Place, Topic from appfd.scripts.ai.lsi.get_topic import run as get_topic from appfd.scripts.ai.lsi.build import run as build from collections import Counter from date_extractor import month_to_number from datetime import datetime stopwords = [month.lower() for month in list(month_to_number.keys())] def run(places=None): try: start = datetime.now() if places is None: # will probably have to do a yield thing at some point #places = Place.objects.exclude(featureplace=None, featureplace__correct=True, featureplace__feature__verified=True) places = Place.objects.exclude(featureplace=None) place_ids = Feature.objects.filter( featureplace__correct=True, featureplace__feature__verified=True).values_list( "featureplace__place_id", flat=True) print("place_ids:", len(place_ids)) for place_id in place_ids: counter = Counter()
from appfd.models import Feature, Place, Topic from appfd.scripts.ai.lsi.get_topic import run as get_topic from appfd.scripts.ai.lsi.build import run as build from collections import Counter from date_extractor import month_to_number from datetime import datetime stopwords = [month.lower() for month in month_to_number.keys()] def run(places=None): try: start = datetime.now() if places is None: # will probably have to do a yield thing at some point #places = Place.objects.exclude(featureplace=None, featureplace__correct=True, featureplace__feature__verified=True) places = Place.objects.exclude(featureplace=None) place_ids = Feature.objects.filter(featureplace__correct=True, featureplace__feature__verified=True).values_list("featureplace__place_id", flat=True) print "place_ids:", len(place_ids) for place_id in place_ids: counter = Counter() for feature in Feature.objects.filter(verified=True, featureplace__place_id=place_id, featureplace__correct=True).exclude(text=None).exclude(text=""): if feature.text: topic_id = get_topic(feature.text) if topic_id: