def get_data_list_preparative(sheet_data):
    """Aggregate the preparative billing rows into per-product monthly costs.

    The sub-project and year-month lists returned by
    ``ReadFromFile.read_statistics_data(sheet_data)`` seed the result and are
    merged with every product name and end-of-use period found in the
    preparative table.

    :param sheet_data: sheet identifier forwarded to
        ``ReadFromFile.read_statistics_data``
    :return: ``(list_subproject, list_year_month, dict_item_cost)``; the two
        lists are de-duplicated and sorted, and ``dict_item_cost`` maps
        product name -> {period: summed cash-account expenditure (float)}
    """
    # Only the two name lists are used here; the statistics dict is ignored.
    # Copying them into sets avoids mutating the lists owned by the reader.
    known_subprojects, known_periods, _ = ReadFromFile.read_statistics_data(
        sheet_data)
    subprojects = set(known_subprojects)
    periods = set(known_periods)

    table_used = ReadFromFile.read_preparative_data(
        '预统计数据清单(正式账户).csv', '数据清单(正式账户).csv')

    by_name = itemgetter('子产品名称')          # sub-product name column
    by_period = itemgetter('结束使用时间')      # end-of-use (year-month) column
    cash_spent = itemgetter('现金账户支出(元)')  # cash-account expenditure column

    dict_item_cost = {}
    # groupby only merges adjacent rows, so sort on the grouping key first.
    table_used.sort(key=by_name)
    for name, rows_for_name in groupby(table_used, key=by_name):
        subprojects.add(name)
        monthly_cost = {}
        rows_by_period = sorted(rows_for_name, key=by_period)
        for period, rows in groupby(rows_by_period, key=by_period):
            periods.add(period)
            monthly_cost[period] = sum(float(cash_spent(row)) for row in rows)
        dict_item_cost[name] = monthly_cost

    return sorted(subprojects), sorted(periods), dict_item_cost
def main():
    """Load or compute the token correlations, then plot them.

    Flags read from ``sys.argv``:
      -h     print the usage message and exit
      -save  hand the plotting routine a file name derived from the arguments
    """
    args = sys.argv
    if '-h' in args:
        print_usage_message()
        exit()

    vect_path = fn.create_prof_vect_name(args)
    corr_path = fn.create_correlations_name(args)

    if os.path.exists(corr_path):
        # A correlations file is already on disk; reuse it.
        corr_tuples = read.token_correlations(corr_path)
    else:
        token_vects = read.word_vects(vect_path)
        if token_vects is None:
            print("Specified vector file not found.")
            print("To create vectors use 'createProfVectors.py'")
            exit()
        rating_vect = read.overall_rating_vect(vect_path)
        vocab = read.vocab_from_vect_file(vect_path)
        corr_tuples = stat.find_correlations(token_vects, rating_vect, vocab)
        # Store the result so the next run takes the branch above.
        write.token_correlations(corr_tuples, corr_path)

    plot_path = None
    if '-save' in args:
        plot_path = fn.create_correlations_plot_name(args)

    plot.tuple_pair_score_correlation(
        corr_tuples,
        title=plot.create_token_pair_score_correlation_name(args),
        saveFile=plot_path)
def create_prof_vectors(tokenSchema, argv, profDicts=None, profTokenDict=None):
    """
    Build one token count vector per professor from that professor's
    aggregated reviews.

    profDicts and profTokenDict are loaded through the read module when they
    are not supplied (argv is only used to derive the token dict file name).

    Returns (profVects, pidsNotIncluded): the records whose 'token_vect' is
    not None, and the sorted pids of the records whose 'token_vect' is None.
    """
    if profDicts is None:
        profDicts = read.prof_dicts()
    if profTokenDict is None:
        profTokenDict = read.prof_token_dicts(
            fn.create_prof_token_dict_name(argv))

    token_index = value_idx_dict(tokenSchema)
    kept, skipped_pids = [], []
    for prof in profDicts:
        merged_counts = count.combine_rev_counters(profTokenDict[prof['pid']])
        record = create_prof_vector(prof, merged_counts, token_index)
        if record['token_vect'] is not None:
            kept.append(record)
        else:
            skipped_pids.append(record['pid'])

    skipped_pids.sort()
    return kept, skipped_pids
def non_single_small_idxs(pidVect):
    """Return a numpy array of the positions in pidVect whose pid is listed in
    neither the single-review pid file nor the small-review-length pid file."""
    excluded = set(read.pids_file(fn.PidsSingleRevFile))
    excluded |= set(read.pids_file(fn.PidsSmallRevLenFile))

    kept = []
    for position, pid in enumerate(pidVect):
        if pid in excluded:
            continue
        kept.append(position)
    return np.array(kept)
def create_rev_vectors(tokenSchema, argv, profDicts=None, profTokenDict=None):
    """
    Create one token vector per individual review.

    :param tokenSchema: tokens passed to value_idx_dict to build the lookup
        handed to create_rev_vector
    :param argv: command-line arguments; only used to derive the prof token
        dict file name when profTokenDict is None
    :param profDicts: professor records, each with a 'reviews' sequence;
        loaded with read.prof_dicts() when None
    :param profTokenDict: kept for signature parity with create_prof_vectors;
        loaded when None but not consulted while building the vectors
    :return: list holding the result of create_rev_vector for every review,
        in profDicts order
    """
    if profDicts is None:
        profDicts = read.prof_dicts()
    if profTokenDict is None:
        # NOTE(review): this load is preserved from the original but the
        # value is never used below; confirm whether it can be dropped.
        ptdName = fn.create_prof_token_dict_name(argv)
        profTokenDict = read.prof_token_dicts(ptdName)

    schemaDict = value_idx_dict(tokenSchema)
    revVects = [
        create_rev_vector(rev, schemaDict)
        for prof in profDicts
        for rev in prof['reviews']
    ]
    # Previously the list was built and then discarded (implicit None return).
    return revVects
def main():
    """Write the two pid list files used to filter professors.

    A pid goes into the single-review list when the professor has exactly one
    review, and into the small-length list when the summed ``len(rev['text'])``
    over the professor's reviews is at most ``MaxWordCount``. Both lists are
    written sorted.
    """
    if '-h' in sys.argv:
        print_usage_message()
        exit()

    profs = read.prof_dicts()
    single_pids, small_pids = [], []
    for prof in profs:
        reviews = prof['reviews']
        if len(reviews) == 1:
            single_pids.append(prof['pid'])
        # NOTE(review): whether 'text' is a token list or a raw string is not
        # visible here, so the unit of this length is unconfirmed.
        combined_len = sum(len(rev['text']) for rev in reviews)
        if combined_len <= MaxWordCount:
            small_pids.append(prof['pid'])

    print("Num singlePids:", len(single_pids))
    print("Num small pids:", len(small_pids))

    single_pids.sort()
    small_pids.sort()
    write.pids_file(single_pids, fn.PidsSingleRevFile)
    write.pids_file(small_pids, fn.PidsSmallRevLenFile)
def speed_of_sound(temp, press, substance=None, formula=None):
    """
    Calculates the speed of sound for a pure substance

        vs = V * sqrt(-(Cp / Cv) * dPdV / (MW * 0.001))

    :param substance: The substance
    :type substance: str
    :param formula: The substance formula
    :type formula: str
    :param temp: The substance temperature (K)
    :type temp: float
    :param press: The substance pressure (Pa)
    :type press: float
    :return: Speed of Sound, vs (m/s)
    :rtype: float
    :raises ValueError: if the phase change data cannot be loaded (cp and cv
        are called first and may raise on their own)
    """
    Cp = cp(temp, substance=substance, formula=formula)
    Cv = cv(temp, press, substance=substance, formula=formula)
    try:
        MW, Tc, Pc, Ttrip, Ptrip, Acentric = read.get_phase_change_data(
            name=substance, formula=formula)
    except Exception as err:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit are no longer converted into ValueError.
        t = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
        logging.error(
            '{0} Speed of Sound: Error loading phase change data.'.format(t))
        raise ValueError('Error loading phase change data.') from err
    # NOTE(review): cv() indexes pr.dPdV(...)[0] while the full return value
    # is used here; confirm which form is intended.
    dPdV = pr.dPdV(temp, press, Tc, Pc, Acentric)
    vol = pr.volume(temp, press, Tc, Pc, Acentric)
    # MW * 0.001: presumably g/mol -> kg/mol; verify against the data source.
    return np.sqrt(-1 / (MW * 0.001) * Cp / Cv * dPdV) * vol
def main():
    """Load (or compute) the three token count tables and plot them.

    Flags read from ``sys.argv``:
      -h     print the usage message and exit
      -ss    build a LancasterStemmer and the matching stopword list
      -save  hand the plotting routine a file name derived from the arguments

    Any table that read.token_count returns as None is rebuilt with
    grab_token_count from the prof token dict.
    """
    if '-h' in sys.argv:
        print_usage_message()
        exit()

    stmr = None
    stopwords = None
    if '-ss' in sys.argv:
        stmr = LancasterStemmer()
        stopwords = read.stopwords(stmr)

    countNames = fn.create_token_count_names(sys.argv)
    rawTokenCountName = countNames[0]
    revTokenCountName = countNames[1]
    profTokenCountName = countNames[2]

    rawTokens = read.token_count(rawTokenCountName, True)
    revTokens = read.token_count(revTokenCountName, True)
    profTokens = read.token_count(profTokenCountName, True)

    # Identity checks: "== None" defers to the operand's __eq__, which is
    # not guaranteed to yield a plain bool for every container type.
    if rawTokens is None or revTokens is None or profTokens is None:
        # Only build the prof token dict when at least one table is missing.
        profTokenDict = grab_prof_token_dict(stopwords, stmr)
        if rawTokens is None:
            rawTokens = grab_token_count(profTokenDict, count.num_tokens,
                                         rawTokenCountName)
        if revTokens is None:
            revTokens = grab_token_count(profTokenDict,
                                         count.num_reviews_with_token,
                                         revTokenCountName)
        if profTokens is None:
            profTokens = grab_token_count(profTokenDict,
                                          count.num_profs_with_token,
                                          profTokenCountName)

    plotName = create_plot_name()
    plotFileName = None
    if '-save' in sys.argv:
        plotFileName = fn.create_count_plot_name(sys.argv)
    plot.token_counts(rawTokens, revTokens, profTokens, plotFileName, plotName)
def process_token_vectors(vects, argv):
    """Apply the weighting selected in argv to every row of vects.

    '-tf' maps each row through to_tf_vect; otherwise '-tfidf' maps each row
    through to_tf_idf_vect with an idf vector from create_idf_vect. With
    neither flag the input is returned untouched.
    """
    if '-tf' in argv:
        return np.apply_along_axis(to_tf_vect, 1, vects)
    if '-tfidf' not in argv:
        return vects

    vocab = read.vocab_from_vect_file(fn.create_prof_vect_name(argv))
    idf = create_idf_vect(vocab, vects.shape[0], argv)
    print(idf.shape, vects.shape)

    def weight_row(row):
        return to_tf_idf_vect(row, idf)

    return np.apply_along_axis(weight_row, 1, vects)
def specific_gravity_c7plus(n, fractions):
    """Return the specific gravity of the C7+ fraction.

    Computed as ``1 / sum(f_i * M_i / (F * gamma_i))`` over positions 6 and
    above, where ``F = sum(f_i * M_i)`` over the same positions.

    :param n: single-carbon-number identifiers, passed as ``scn=`` to
        read.get_phase_change_data
    :param fractions: per-component fractions, aligned with ``n``
    :return: 1 / density as defined above
    :raises ZeroDivisionError: when there is no component at position 6 or
        beyond (the density sum is then 0)
    """
    # One lookup per component (the original fetched every record twice).
    # NOTE(review): index 0 is used as M and index 4 as gamma; confirm the
    # record layout for scn lookups.
    M, gamma = [], []
    for scn_number in n:
        record = read.get_phase_change_data(scn=scn_number)
        M.append(record[0])
        gamma.append(record[4])

    # NOTE(review): position 6 is assumed to be the first C7+ component,
    # i.e. n is expected to start at SCN 1 without gaps; verify with callers.
    c7plus = range(6, len(n))
    fraction7 = sum(fractions[i] * M[i] for i in c7plus)
    density = sum(
        fractions[i] * M[i] / (fraction7 * gamma[i]) for i in c7plus)
    return 1 / density
def MW_c7plus(n, fractions):
    """Return sum(M_i * fractions_i / z_c7p) over the plus fraction, where
    M_i is element 0 of read.get_phase_change_data(scn=n[i]) and z_c7p comes
    from z_c7plus(n, fractions)."""
    z_c7p = z_c7plus(n, fractions)

    # Advance to the first entry of n that is not below 8, then step back one
    # position to get the start of the summation.
    # NOTE(review): raises IndexError when no entry reaches 8, and yields -1
    # when n[0] is already >= 8; confirm the expected shape of n.
    start = 0
    while n[start] < 8:
        start += 1
    start -= 1

    return sum(
        read.get_phase_change_data(scn=n[i])[0] * fractions[i] / z_c7p
        for i in range(start, len(n)))
def token_schema_from_count(argv):
    """Return the sorted tokens whose count is at least the integer that
    follows '-minCount' in argv.

    Counts come from the second file name produced by
    fn.create_token_count_names (number of reviews a token appears in).
    Prints a hint and exits when that file cannot be read.
    """
    review_count_file = fn.create_token_count_names(argv)[1]
    counts = read.token_count(review_count_file)
    if counts is None:
        print("Token count file not found.")
        print("Create token count file using 'countTokens.py'")
        exit()

    threshold = int(argv[argv.index('-minCount') + 1])
    return sorted(tok for tok, seen in counts.items() if seen >= threshold)
def create_idf_vect(vocab, numProfs, argv):
    """
    vocab is expected to be a python list

    Returns log(numProfs / count) for every word of vocab, with the counts
    taken from the third file name produced by fn.create_token_count_names.
    """
    prof_count_file = fn.create_token_count_names(argv)[2]
    tokCounts = read.token_count(prof_count_file)
    per_word_counts = np.array([tokCounts[word] for word in vocab],
                               dtype=float)
    return np.log(numProfs / per_word_counts)
def __init__(self, path):
    """Read the file at path, parse it, and build the transformed copies of
    the examples and attributes."""
    self.path = path

    # File-reading utility; it is run on the stored path before parsing.
    reader = rff.ReadFile()
    self.util = reader
    reader.Run(self.path)

    # Separates each attribute type and its examples into its own list
    self.ParseData()
    self.ExampleToDataType()

    # Work on copies so the transformation below leaves the parsed data alone.
    examples_snapshot = self.examples.copy()
    self.transformed_examples = copy.deepcopy(examples_snapshot)
    self.transformed_attributes = self.attributes.copy()

    # Takes the attributes of each example and transforms them to numbers
    self.TransformAttributes()
    self.TransformExamples(self.transformed_examples)
def cv(temp, press, substance=None, formula=None):
    """
    Calculates the constant-volume heat capacity of a substance.
    Cv = Cp + T * dVdP * dPdT**2

    :param substance: The substance
    :type substance: str
    :param formula: The substance formula
    :type formula: str
    :param temp: The substance temperature (K)
    :type temp: float
    :param press: The substance pressure (Pa)
    :type press: float
    :return: cv (J/mol-K)
    :rtype: float
    :raises ValueError: if neither substance nor formula is given, or if the
        phase change data cannot be loaded
    """
    if substance is None and formula is None:
        t = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
        logging.error(
            '{0} Volumetric Heat Capacity: No name or formula input.'.format(
                t))
        raise ValueError('No name or formula input.')
    try:
        MW, Tc, Pc, Ttrip, Ptrip, Acentric = read.get_phase_change_data(
            name=substance, formula=formula)
    except Exception as err:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit are no longer converted into ValueError.
        t = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
        logging.error(
            '{0} Volumetric Heat Capacity: Error loading phase change data.'.
            format(t))
        raise ValueError('Error loading phase change data.') from err
    Cp = cp(temp, substance=substance, formula=formula)
    # Only element [0] of each derivative result is used.
    dPdT = pr.dPdT(temp, press, Tc, Pc, Acentric)[0]
    dVdP = 1 / pr.dPdV(temp, press, Tc, Pc, Acentric)[0]
    return Cp + temp * dVdP * dPdT**2
def cp(temp, substance=None, formula=None):
    """
    Calculates the constant pressure heat capacity of a substance.
    Cp = A + B/1e2 T + C/1e5 T^2 + D/1e9 T^3

    :param substance: The substance
    :type substance: str
    :param formula: The substance formula
    :type formula: str
    :param temp: The substance temperature (K)
    :type temp: float
    :return: cp (J/mol-K)
    :rtype: float
    :raises ValueError: if neither substance nor formula is given
    """
    if substance is None and formula is None:
        t = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
        logging.error('{0} Heat Capacity: No name or formula input.'.format(t))
        raise ValueError('No name or formula input.')
    const = read.get_heat_capacity_constants(name=substance, formula=formula)
    # The unused gas constant and the no-op polynomial expression that used
    # to sit here were dead code and have been removed.
    A, B, C, D = const[0], const[1], const[2], const[3]
    return A + B * 1e-2 * temp + C * 1e-5 * temp**2 + D * 1e-9 * temp**3
def token_schema_from_correlations(argv):
    """Return the sorted set of tokens that occur in any correlation tuple
    passing the two thresholds that follow '-corr' in argv (an integer
    minimum count, then a float minimum score).

    Prints a hint and exits when the correlations file cannot be read.
    """
    corr_tuples = read.token_correlations(fn.create_correlations_name(argv))
    if corr_tuples is None:
        print("Correlations file not found")
        print("Create correlations file with 'findCorrelations.py'")
        exit()

    flag_at = argv.index('-corr')
    min_count = int(argv[flag_at + 1])
    min_score = float(argv[flag_at + 2])

    # NOTE(review): the score test compares cor[3] with abs(min_score), so
    # negatively scored pairs never pass; confirm that
    # abs(cor[3]) >= min_score was not what was meant.
    tokens = set()
    for cor in corr_tuples:
        if cor[2] >= min_count and cor[3] >= abs(min_score):
            tokens.add(cor[0])
            tokens.add(cor[1])
    return sorted(tokens)
def grab_prof_token_dict(stopwords, stmr):
    """Return the prof token dict, using the pickle file as a cache.

    When the file named by fn.create_prof_token_dict_name exists it is loaded
    and returned. Otherwise the dict is built from read.prof_dicts() with the
    tokenizer chosen by the command line ('-tup', else '-stup', else single
    tokens), written to that file, and returned.
    """
    filename = fn.create_prof_token_dict_name(sys.argv)
    if os.path.exists(filename):
        # NOTE: unpickling runs code embedded in the file; this is only safe
        # for cache files produced by this program.
        with open(filename, 'rb') as cache:
            return pickle.load(cache)

    if '-tup' in sys.argv:
        make_tokens = count.create_tuple_tokens
    elif '-stup' in sys.argv:
        make_tokens = count.create_single_tuple_tokens
    else:
        make_tokens = count.create_single_tokens

    def token_f(t):
        return make_tokens(t, stopwords, stmr)

    built = count.create_prof_token_dict(read.prof_dicts(), token_f)
    with open(filename, 'wb') as cache:
        pickle.dump(built, cache)
    return built
logger = logging.getLogger(name) logger.setLevel(level) logger.addHandler(handler) return logger # Setup Log Files root_path = os.path.dirname(os.path.realpath(__file__)) runlog = setup_logger('runlog', root_path + '/Logs/run.log', level=logging.DEBUG) alglog = setup_logger('alglog', root_path + '/Logs/alg.log') runlog.info('START Decline Curve Analysis.') df = read.production_monthyear(root_path + '/Data/spindletop.csv') prod = read.production_by_month(df) prod1 = np.extract( np.extract(prod[0] < 1926, prod) > 1902.6, np.extract(prod[0] < 1926, prod[1])) time1 = [ i - 1902.6 + 1e-9 for i in np.extract( np.extract(prod[0] < 1926, prod) > 1902.6, np.extract(prod[0] < 1926, prod[0])) ] time1 = np.extract(~np.isnan(prod1), time1) prod1 = np.extract(~np.isnan(prod1), prod1) prod2 = np.extract(prod[0] > 1926.5, prod[1]) time2 = np.extract(prod[0] > 1926.5, prod[0]) - 1926.5 time2[0] = 1e-8
def main():
    """Fit a feed-forward model on the professor token vectors and plot the
    training history.

    Rows whose pid is in neither pid file form the training set; rows whose
    pid is in the single-review or the small-review-length file form the
    validation set.

    Flags read from ``sys.argv``:
      -h     print the usage message and exit
      -d     use the difficulty ratings instead of the overall ratings
      -deep  use ffnn.deep_model instead of ffnn.shallow_model
      -save  hand the plotting routine a file name derived from the arguments
    """
    args = sys.argv
    if '-h' in args:
        print_usage_message()
        exit()

    vect_file = fn.create_prof_vect_name(args, True)
    token_vects = read.word_vects(vect_file)
    if token_vects is None:
        print("Could not find token vects")
        print("Use 'createProfVectors.py' to create vectors")
        exit()
    token_vects = vp.process_token_vectors(token_vects, args)

    if '-d' in args:
        ratings = read.difficulty_rating_vect(vect_file)
    else:
        ratings = read.overall_rating_vect(vect_file)

    # Create training and validation index sets
    pids = read.pid_vect(vect_file)
    train_idxs = ffnn.non_single_small_idxs(pids)
    single_idxs = vp.pids_to_idxs(pids, read.pids_file(fn.PidsSingleRevFile))
    small_idxs = vp.pids_to_idxs(pids, read.pids_file(fn.PidsSmallRevLenFile))
    valid_idxs = np.array(sorted(set(single_idxs).union(set(small_idxs))))

    x_train, y_train = token_vects[train_idxs, :], ratings[train_idxs]
    x_valid, y_valid = token_vects[valid_idxs, :], ratings[valid_idxs]
    print(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)

    # Select and train model
    build_model = ffnn.deep_model if '-deep' in args else ffnn.shallow_model
    model = build_model(token_vects.shape[1])
    history = model.fit(x_train,
                        y_train,
                        epochs=10,
                        batch_size=5,
                        validation_data=(x_valid, y_valid))

    title = plot.ffnn_error_title(args)
    out_file = fn.create_ffnn_plot_name(args) if '-save' in args else None
    plot.ffnn_error(history, title=title, filename=out_file)
#!/usr/bin/python
# Driver script: runs the do() entry point of each stage module in order.
import ReadFromFile
import updateAuthorIdinPaperAuthor
import InsertIntoAuthorAuthor

# The stage order is fixed: read, update author ids, insert author pairs.
for stage in (ReadFromFile, updateAuthorIdinPaperAuthor,
              InsertIntoAuthorAuthor):
    stage.do()
#print(item_cost)#子产品每月花费 #print(dict_item_cost) list_year_month = list(set(list_year_month)) list_subproject = list(set(list_subproject)) list_year_month.sort() list_subproject.sort() #print(dict_item_cost) #print(list_year_month) #print(list_name) #ReadFromFile.WriteToFile.write_statistics_data(sheet_data, list_subproject, list_year_month, dict_item_cost) return list_subproject, list_year_month, dict_item_cost sheet_data = '正式账户' list_subproject_statistics, list_year_month_statistics, dict_statistics_data = ReadFromFile.read_statistics_data(sheet_data) list_subproject_preparative, list_year_month_preparative, dict_preparative_data = get_data_list_preparative(sheet_data) list_subproject_preparative.extend(list_subproject_statistics) list_subproject = list(set(list_subproject_preparative)) list_subproject.sort() list_year_month_preparative.extend(list_year_month_statistics) list_year_month = list(set(list_year_month_preparative)) list_year_month.sort() for preparative_key,preparative_value in dict_preparative_data.items(): print(preparative_key in dict_statistics_data.keys()) if dict_statistics_data[preparative_key] is not None: print(preparative_value) for preparative_value_key,preparative_value_value in preparative_value.items(): print(preparative_value_key in dict_statistics_data[preparative_key].keys()) ## if (preparative_value_key in dict_statistics_data[preparative_key].keys()): ## if dict_statistics_data[preparative_key][preparative_value_key] is not None:
def main():
    """Print summary statistics of the review data set and plot the counts.

    Reports review lengths, reviews per professor, tokens per professor, the
    rating means and standard deviations, and the absolute error of the
    "guess the mean" baseline for all professors, for professors listed in
    the single-review pid file and for those in the small-review-length pid
    file. '-save' is forwarded to the plot as ``save=True``.
    """
    if '-h' in sys.argv:
        print_usage_message()
        exit()

    profs = read.prof_dicts()

    def overall_ratings_of(pids):
        # Overall ratings of the professors whose pid is in the given set.
        return np.array(
            [p['rating_overall'] for p in profs if p['pid'] in pids],
            dtype=float)

    rev_lens = st.rev_len_arr(profs)
    print("Number of Reviews:", rev_lens.shape[0])
    print("Mean review length:", rev_lens.mean())
    print("Std Dev review length:", rev_lens.std())
    print()

    revs_per_prof = st.num_revs_profs(profs)
    print("Number of professors:", revs_per_prof.shape[0])
    print("Mean num reviews per prof:", revs_per_prof.mean())
    print("Std Dev num revews per prof:", revs_per_prof.std())
    print()

    tokens_per_prof = st.profs_revs_len(profs)
    print("Mean tokens per prof:", tokens_per_prof.mean())
    print("Std Dev tokens per prof:", tokens_per_prof.std())
    print()

    overall = np.array([p['rating_overall'] for p in profs], dtype=float)
    difficulty = np.array([p['rating_difficulty'] for p in profs],
                          dtype=float)
    overall_mean = overall.mean()
    difficulty_mean = difficulty.mean()
    print("Overall ratings mean:", overall_mean)
    print("Overall ratings std dev:", overall.std())
    print("Difficulty ratings mean:", difficulty_mean)
    print("Difficulty ratings std dev:", difficulty.std())
    print()

    # Baseline: predict the mean rating for everyone.
    overall_err = np.abs(overall - overall_mean)
    difficulty_err = np.abs(difficulty - difficulty_mean)
    print("Nieve approach to prediction: Guessing the Mean")
    print("All profs")
    print("Overall absolute error mean:", overall_err.mean())
    print("Overall absolute error std div:", overall_err.std())
    print("Difficulty absolute error mean:", difficulty_err.mean())
    print("Difficulty absolute error std div:", difficulty_err.std())
    print()

    one_rev_ratings = overall_ratings_of(
        set(read.pids_file(fn.PidsSingleRevFile)))
    one_rev_err = np.abs(one_rev_ratings - one_rev_ratings.mean())
    print("Profs with one review")
    print("One review absolute error mean:", one_rev_err.mean())
    print("One review absolute error std div:", one_rev_err.std())
    print()

    small_ratings = overall_ratings_of(
        set(read.pids_file(fn.PidsSmallRevLenFile)))
    small_err = np.abs(small_ratings - small_ratings.mean())
    print("Profs with short reviews")
    print("Small review absolute error mean:", small_err.mean())
    print("small review absolute error std div:", small_err.std())
    print()

    save = '-save' in sys.argv
    plot.plot_word_review_count(rev_lens, tokens_per_prof, revs_per_prof,
                                save=save)
def main():
    """Produce (or load cached) k-NN rating predictions and plot their error.

    Predictions and the similarity matrix are each read from their own file
    when available; anything missing is computed and written back.

    Flags read from ``sys.argv``:
      -h           print the usage message and exit
      -d           use the difficulty ratings instead of the overall ratings
      -cos/-pear   cosine / absolute Pearson similarity (default: inverse
                   euclidean distance)
      -unweighted  pass weighted=False to knn.knn_dataset
      -maxK N      keep only the first N prediction columns
      -save        hand the plotting routine a file name derived from the
                   arguments
    """
    args = sys.argv
    if '-h' in args:
        print_usage_message()
        exit()

    vect_file = fn.create_prof_vect_name(args, True)
    sim_mat_file = fn.create_sim_mat_name(args)
    preds_file = fn.create_preds_name(args)
    for path in (vect_file, sim_mat_file, preds_file):
        print(path)

    # Grab the ratings vector
    if '-d' in args:
        ratings = read.difficulty_rating_vect(vect_file)
    else:
        ratings = read.overall_rating_vect(vect_file)

    # Assign similarity metric
    if '-cos' in args:
        sim_f = vp.cosine_similarity
    elif '-pear' in args:
        sim_f = vp.abs_pearson_correlation
    else:
        sim_f = vp.inverse_euclidean_distance

    weighted = '-unweighted' not in args

    # Grab predictions or create them if not available
    predictions = read.knn_predictions(preds_file)
    if predictions is None:
        sim_mat = read.similarity_matrix(sim_mat_file)
        if sim_mat is None:
            word_vects = read.word_vects(vect_file)
            if word_vects is None:
                print("Vector file " + vect_file + " does not exist")
                exit()
            word_vects = vp.process_token_vectors(word_vects, args)
            sim_mat = knn.get_similarity_matrix(word_vects, sim_f)
            write.similarity_matrix(sim_mat, sim_mat_file)
        predictions = knn.knn_dataset(ratings, MaxK, sim_mat, weighted)
        write.knn_predictions(predictions, preds_file)

    if '-maxK' in args:
        max_k = int(args[args.index('-maxK') + 1])
        predictions = predictions[:, :max_k]

    pids = read.pid_vect(vect_file)
    single_rev_idxs = vp.pids_to_idxs(pids,
                                      read.pids_file(fn.PidsSingleRevFile))
    small_len_idxs = vp.pids_to_idxs(pids,
                                     read.pids_file(fn.PidsSmallRevLenFile))

    plot_file = fn.create_knn_accuracy_plot_name(args) if '-save' in args else None

    # Output results of the run
    small_len_title = ("Error with profs with aggrigate review "
                       "lengths one std div above the mean "
                       "review length or less")
    plot.knn_error(
        predictions,
        ratings,
        title=plot.create_knn_error_title(args),
        idxToPlot=[single_rev_idxs, small_len_idxs],
        subTitles=["Error with profs with one review", small_len_title],
        saveFile=plot_file)
handler = logging.FileHandler(log_file) handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) logger = logging.getLogger(name) logger.setLevel(level) logger.addHandler(handler) return logger # Setup Log Files root_path = os.path.dirname(os.path.realpath(__file__)) runlog = setup_logger('runlog', root_path + '/Logs/run.log', level=logging.DEBUG) alglog = setup_logger('alglog', root_path + '/Logs/alg.log') runlog.info('START Thermodynamic Analysis of Multi-Phase Petroleum Fluids.') names, number, fractions = read.scn_composition(root_path + '/Data/composition.txt') # delta = read.get_binary_interations(scn_list=number) mw_raw = list() for n in range(0, len(number)): mw_raw.append(read.get_phase_change_data(scn=number[n])[0]) mw30 = list() for n in range(0, 30): mw30.append(read.get_phase_change_data(scn=n + 1)[0]) zc7p = scn.z_c7plus(number, fractions) Mc7p = scn.MW_c7plus(number, fractions) gc7p = scn.specific_gravity_c7plus(number, fractions) fraction7 = list()