def wpre(self):
    """Return the total precipitation recorded during ISO week self.week."""
    x = DataParser('weatherNYC.csv')
    total = 0
    for i in range(len(x.dates)):  # iterate over every parsed day, not a hardcoded 360
        if x.dates[i].isocalendar()[1] == self.week:
            total = total + x.precipations[i]
    return total
def mavg(self):
    """Return the mean of the daily average temperatures for self.month."""
    total = 0
    counter = 0
    x = DataParser('weatherNYC.csv')
    for i in range(len(x.dates)):
        if x.dates[i].month == self.month:
            total = total + x.avgs[i]
            counter = counter + 1
    return total / counter
def mmax(self):
    """Return the highest daily maximum temperature recorded in self.month."""
    mmax = -500
    x = DataParser('weatherNYC.csv')
    dates = x.dates
    maxs = x.maxs
    for i in range(len(dates)):
        if dates[i].month == self.month and maxs[i] > mmax:
            mmax = maxs[i]
    return mmax
def mmin(self):
    """Return the lowest daily minimum temperature recorded in self.month."""
    mmin = 1000
    x = DataParser('weatherNYC.csv')
    dates = x.dates
    mins = x.mins
    for i in range(len(dates)):
        if dates[i].month == self.month and mins[i] < mmin:
            mmin = mins[i]
    return mmin
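# Usage sketch for wpre()/mavg()/mmax()/mmin(): they only rely on `self.week`
# and `self.month`, so a small holder class is enough to drive them. The class
# name `MonthlyReport` and the values below are illustrative assumptions, not
# taken from the source; mavg() is repeated here only to keep the sketch
# self-contained and runnable.
from dataParser import DataParser


class MonthlyReport:
    def __init__(self, month, week):
        self.month = month  # calendar month used by mavg()/mmax()/mmin()
        self.week = week    # ISO week number used by wpre()

    def mavg(self):
        # Same logic as mavg() above.
        x = DataParser('weatherNYC.csv')
        total, counter = 0, 0
        for i in range(len(x.dates)):
            if x.dates[i].month == self.month:
                total += x.avgs[i]
                counter += 1
        return total / counter


if __name__ == '__main__':
    print(MonthlyReport(month=7, week=27).mavg())  # mean July temperature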
def main():
    _initWordDic()

    # Parse the data using DataParser.
    parser = DataParser()
    docs, summary = parser.parseFile()
    p_doc = Preparer(docs)
    p_summary = Preparer(summary, is_summary=True)
    p_doc.cutDocs()
    p_summary.cutDocs()
    docLens = p_doc.countDocs()
    sumLens = p_summary.countDocs()
    print(max(sumLens))
    #sys.exit()
    p_doc.doc2Int()
    p_summary.doc2Int()

    # docs, docLens, summary, sumLens are the data.
    data = list(zip(docs, summary, docLens, sumLens))
    training_data = data[:1585]
    validation_data = data[1585:1835]
    testing_data = data[1835:]

    '''FIXING THE DIMENSION ISSUES OF BATCHES
    sf_train = SF(training_data, CONFIG.BATCH_SIZE, is_training=True)
    sf_valid = SF(validation_data, CONFIG.BATCH_SIZE, is_training=False)
    for tup in sf_train.get_batch():
        _, doc, summary, docLens, sumLens = tup
        doc_batch = _get_doc_batch(doc)
        summary_batch = _get_summary_batch(summary)
        label_batch = _get_label_batch(summary)
        docLens = np.array(docLens)
        summaryLens = np.array(sumLens)
        print(doc_batch[0])
        print(summary_batch[0])
        print(label_batch[0])
        print(list(doc for doc in docLens))
        print(list(doc for doc in summaryLens))
        sys.exit()
    '''

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-1, 1)

        # Build one model per phase; Valid/Test reuse the Train variables.
        with tf.name_scope('Train'):
            with tf.variable_scope('Model', reuse=None, initializer=initializer):
                m = SummaryModel(is_training=True)
        with tf.name_scope('Valid'):
            with tf.variable_scope('Model', reuse=True, initializer=initializer):
                m_valid = SummaryModel(is_training=False)
        with tf.name_scope('Test'):
            with tf.variable_scope('Model', reuse=True, initializer=initializer):
                m_test = SummaryModel(is_training=False)

        init_op = tf.global_variables_initializer()
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = '7'
        sess = tf.Session(config=config)
        sess.run(init_op)
        for epoch in range(CONFIG.EPOCH):
            print('--------------- running epoch ' + str(epoch) + ' ----------------')
            run_epoch(sess, m, m_valid, training_data, validation_data)
def main():
    db = initializeMongoDB()
    tweetsTrainingCollection = db["tweetsTrainData"]
    fetchDataFrom = "Kaggle"
    data = []
    tracemalloc.start()
    getMemoryUsage()

    if fetchDataFrom == "db":
        tokensForModel = DataParser.parseTweets(True, NewDataRepository.getData(), "")
        NewDataRepository.saveTrainingAndTestData(tokensForModel, db, "False")
        positiveTrainingTweets = NewDataRepository.getPositiveDBTweets(tweetsTrainingCollection)
        negativeTrainingTweets = NewDataRepository.getNegativeDBTweets(tweetsTrainingCollection)
        data = DataParser.prepareDataForTraining(True, positiveTrainingTweets, negativeTrainingTweets)

    if fetchDataFrom == "twitter_samples":
        #positive_tweets = twitter_samples.strings('positive_tweets.json')
        #negative_tweets = twitter_samples.strings('negative_tweets.json')
        positiveTrainingTweets = DataParser.parseTweets(False, SampleRepository.getPositiveTweets(), "positive")
        negativeTrainingTweets = DataParser.parseTweets(False, SampleRepository.getNegativeTweets(), "negative")
        data = DataParser.prepareDataForTraining(False, positiveTrainingTweets, negativeTrainingTweets)

    if fetchDataFrom == "Kaggle":
        tweets = KaggleRepository.getExtensiveCSVTweetsForTraining()
        negativeTweets = []
        positiveTweets = []
        for tweet in tweets:
            if tweet[0] == 0:
                negativeTweets.append(tweet[1])
            if tweet[0] == 4:
                positiveTweets.append(tweet[1])
        del tweets
        negativeTrainingTweets = DataParser.parseTweets(True, negativeTweets[:10000], "negative")
        positiveTrainingTweets = DataParser.parseTweets(True, positiveTweets[:10000], "positive")
        getMemoryUsage()
        del negativeTweets
        del positiveTweets
        #gc.collect()
        getMemoryUsage()
        data = DataParser.prepareDataForTraining(False, positiveTrainingTweets, negativeTrainingTweets)
        del positiveTrainingTweets
        del negativeTrainingTweets
        getMemoryUsage()

    # 70/30 split into training and test data.
    percentageOfTrainingData = 0.7
    trainingData = data[:int(len(data) * percentageOfTrainingData)]
    testData = data[int(len(data) * percentageOfTrainingData):]
    del data
    getMemoryUsage()
    #KaggleRepository.saveTweetsInFile(trainingData, "Positive")
    #KaggleRepository.saveTweetsInFile(trainingData, "Negative")

    bayesClassifier = BayesClassifier(0)
    bayesClassifier.train(trainingData)
    bayesClassifier.setAccuracy(testData)
    print("accuracy : ", bayesClassifier.getAccuracy())
    #del trainingData
    #del testData
    #getMemoryUsage()
    #gc.collect

    #customTweet = 'Thank you for sending my baggage to CityX and flying me to CityY at the same time... Brilliant service. #thanksGenericAirline'
    customTweet = 'With this said, I think we are going to the moon'
    customTokens = DataParser.removeNoise(customTweet, stopwords.words('english'))
    print(bayesClassifier.avalueTweet(customTokens))

    customTweet = 'With this said, I think we are going all the way down'
    customTokens = DataParser.removeNoise(customTweet, stopwords.words('english'))
    print(bayesClassifier.avalueTweet(customTokens))
def buildIndex(iterations,
               reset=True,
               resetFiles=True,
               passwordLock=True,
               dev=False,
               options={
                   'crawl': True,
                   'pageRank': True,
                   'parse': True,
                   'database': True,
                   'idf': True,
                   'tfidf': True
               }):
    """Run the full crawler suite: crawl, PageRank, parse, build the database, then compute IDF and TF-IDF."""
    log('build index', 'Running full suite of crawler programs.')
    programStartTime = time.time()
    loginSuccess = False

    # Resetting the database is password-protected unless passwordLock is disabled.
    if reset and passwordLock:
        log("info", "You are about to reset the database")
        pwd = getpass('Enter password to continue:').encode('UTF-8')
        if bcrypt.checkpw(pwd, loginPwd):
            loginSuccess = True
            log('login', 'Login successful. Resetting databases.')
        else:
            log('login', 'Login failed. Reset operation not performed')
    else:
        loginSuccess = True

    if resetFiles and exists('domains'):
        log('cleanup', 'Removing old domains folder')
        rmtree('./domains')

    reset and loginSuccess and DatabaseBuilder.resetInvertedIndex() and DatabaseBuilder.resetCrawler()

    for domain in domains:
        domainStartTime = time.time()
        if options['crawl']:
            crawler = Crawler(domain['name'], domain['root'])
            crawler.runSpider(iterations)
        inlinkGraphFile = 'domains/' + domain['name'] + '/' + domain['name'] + '_inlinks.json'
        outlinkGraphFile = 'domains/' + domain['name'] + '/' + domain['name'] + '_outlinks.json'
        options['pageRank'] and calculatePageRank(domain['name'], inlinkGraphFile, outlinkGraphFile, 3)
        if options['parse']:
            dataParser = DataParser(domain['name'])
            dataParser.runParser()
        if options['database']:
            databaseBuilder = DatabaseBuilder(domain['name'], mode='DEV' if dev else 'PROD')
            databaseBuilder.build()
        log("time", domain['name'] + " finished running in " + str(time.time() - domainStartTime) + " seconds.")

    options['idf'] and DatabaseBuilder.calculateIDF()
    options['tfidf'] and calculateTFIDF()
    log("time", "Program finished running in " + str(time.time() - programStartTime) + " seconds.")
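# Illustrative call into buildIndex(); the import path `crawler_suite` and the
# argument values are assumptions, only the signature above comes from the source.
from crawler_suite import buildIndex  # hypothetical module name

# Crawl each domain for two iterations in dev mode, without resetting anything
# (so no password prompt is needed), and rebuild the derived statistics.
buildIndex(
    iterations=2,
    reset=False,
    resetFiles=False,
    passwordLock=False,
    dev=True,
    options={
        'crawl': True,
        'pageRank': True,
        'parse': True,
        'database': True,
        'idf': True,
        'tfidf': True,
    },
)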
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.animation as animation
from matplotlib import style

from dataParser import DataParser

style.use('dark_background')

# Plot the daily average temperatures parsed from the NYC weather CSV.
x = DataParser('weatherNYC.csv')
plt.plot(x.dates, x.avgs)
plt.title('NYC temperature in 2016 (in ℉)')
plt.show()
def test_mins(self):
    x = DataParser('test.csv')
    self.assertEqual(x.mins, [71, 75])
def test_maxs(self):
    x = DataParser('test.csv')
    self.assertEqual(x.maxs, [89, 91])
def test_precipations(self):
    x = DataParser('test.csv')
    self.assertEqual(x.precipations, [0, 0.22])
def test_avgs(self):
    x = DataParser('test.csv')
    self.assertEqual(x.avgs, [80, 83])
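# The four assertions above read as unittest methods; a minimal harness sketch,
# assuming they live on a TestCase subclass (the class name `TestDataParser`
# and the location of test.csv are assumptions). Only test_avgs is repeated
# here to keep the sketch runnable.
import unittest

from dataParser import DataParser


class TestDataParser(unittest.TestCase):
    def test_avgs(self):
        x = DataParser('test.csv')
        self.assertEqual(x.avgs, [80, 83])


if __name__ == '__main__':
    unittest.main()  # or: python -m unittest discover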