def fromRelativePath(relPath: str) -> BatchPipeline:
    """Constructs a pipeline from config found at relative path. Relative
    config overwrites general config found at `$LCML/conf/common/pipeline.json`

    :param relPath: relative path to specific config overriding default config
    :return: constructed BatchPipeline object
    """
    defaultConf = loadJson(joinRoot(_DEFAULT_PIPE_CONF_REL_PATH))
    relConf = loadJson(joinRoot(relPath))
    conf = recursiveMerge(defaultConf.copy(), relConf)

    t = PrettyTable(["global param", "value"])
    t.align = "l"
    for k, v in sorted(conf[GLOBAL_PARAMS].items()):
        t.add_row([k, v])

    logger.info("Global params\n%s", str(t))
    pipeConf = loadPipelineConf(conf)
    pipeType = pipeConf.globalParams["type"]
    if pipeType == "supervised":
        pipe = SupervisedPipeline(pipeConf)
    elif pipeType == "unsupervised":
        pipe = UnsupervisedPipeline(pipeConf)
    else:
        raise ValueError("unsupported pipeline type: %s" % pipeType)

    return pipe

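# Usage sketch (illustrative, not part of the module): the conf path below is
# hypothetical; any JSON file overriding `$LCML/conf/common/pipeline.json` and
# declaring globalParams["type"] as "supervised" or "unsupervised" would work.
def _exampleFromRelativePath():
    pipeline = fromRelativePath("conf/macho/supervised-pipeline.json")
    # the concrete class depends on the merged conf's globalParams["type"]
    print(type(pipeline).__name__)
    return pipeline
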
def main():
    inPath = joinRoot("data/macho/macho-classifications.csv")
    outDir = joinRoot("data/macho/class")
    commandBase = tapCommandBase()
    query = ("SELECT dateobs, rmag, rerr, bmag, berr "
             "FROM public.photometry_view "
             "WHERE fieldid=%s AND tileid=%s AND seqn=%s")
    classCounts = defaultdict(int)
    classData = np.loadtxt(fname=inPath, dtype=int, delimiter=",", skiprows=1)
    logger.critical("processing %d requests", len(classData))
    smallestLcNoRetry = 20
    for field, tile, seqn, classif in classData:
        classCounts[classif] += 1
        fname = "field=%s_tile=%s_seqn=%s_class=%s" % (field, tile, seqn,
                                                       classif)
        outPath = os.path.join(outDir, fname + ".csv")
        if os.path.exists(outPath):
            _tempData = np.loadtxt(outPath, dtype=str, delimiter=",")
            if len(_tempData) > smallestLcNoRetry:
                # skip download if we already have a file with sufficient data
                # logger.critical("skipping %s", fname)
                continue

        logger.critical(outPath)
        fullQuery = query % (field, tile, seqn)
        cmd = commandBase + ["adql=" + fullQuery, "out=" + outPath]
        try:
            subprocess.check_output(cmd)
        except CalledProcessError:
            logger.exception("JAR call failed")
            continue

    # +----------+--------+------------+
    # | Category | Counts | Percentage |
    # +----------+--------+------------+
    # | 1        | 7405   | 34.48      |
    # | 2        | 1765   | 8.22       |
    # | 3        | 315    | 1.47       |
    # | 4        | 1185   | 5.52       |
    # | 5        | 683    | 3.18       |
    # | 6        | 315    | 1.47      |
    # | 7        | 822    | 3.83       |
    # | 8        | 1134   | 5.28       |
    # | 9        | 778    | 3.62       |
    # | 10       | 6835   | 31.83      |
    # | 11       | 237    | 1.1        |
    # +----------+--------+------------+
    t = PrettyTable(["Category", "Counts", "Percentage"])
    totalCounts = sum(classCounts.values())
    for cat, counts in sorted(classCounts.items()):
        t.add_row([cat, counts, round(100.0 * counts / totalCounts, 2)])

    logger.critical("\n" + str(t))

def main():
    start = time.time()
    args = _getArgs()
    dataset = "macho"
    dataDir = joinRoot("data", dataset)
    outDir = joinRoot("results", dataset)
    if not os.path.exists(outDir):
        os.makedirs(outDir)

    logger.info("Loading RF classifier...")
    randomForestModel = upsilon.load_rf_model()
    runDataset(dataDir, randomForestModel, outDir, args.threads, args.rows)
    logger.info("finished in: %.2fs", time.time() - start)

def loadFlatLcDataset(params: dict, dbParams: dict, table: str, limit: float):
    """Loads and aggregates light curves from a single csv file of individual
    data points, storing the results in a database."""
    dataPath = joinRoot(params["relativePath"])
    logger.info("Loading from: %s", dataPath)
    skiprows = params["skiprows"]
    commitFrequency = dbParams["commitFrequency"]

    dataName = params["dataName"]
    logger.info("Using %s LC adapter", dataName)
    if dataName == "ogle3":
        adapter = Ogle3Adapter
    elif dataName == "macho":
        adapter = MachoAdapter
    elif dataName == "k2":
        adapter = K2Adapter
    else:
        raise ValueError("Unsupported dataName: %s" % dataName)

    conn = connFromParams(dbParams)
    cursor = conn.cursor()
    reportTableCount(cursor, table, msg="before loading")
    insertOrReplaceQuery = INSERT_REPLACE_INTO_LCS % table
    with open(dataPath, "r") as f:
        reader = csv.reader(f, delimiter=",")
        for _ in range(skiprows):
            next(reader)

        completedLcs = 0
        uid = label = times = mags = errors = None
        for row in reader:
            if adapter.rowEquals(row, uid):
                # continue building current LC
                adapter.appendRow(times, mags, errors, row)
            else:
                if uid is not None:
                    # finish current LC, except for first time
                    args = (uid, label) + serLc(times, mags, errors)
                    cursor.execute(insertOrReplaceQuery, args)
                    completedLcs += 1
                    if logger.isEnabledFor(logging.DEBUG):
                        logger.debug("completed lc with len: %s", len(times))

                    if not completedLcs % commitFrequency:
                        logger.info("committing progress: %s", completedLcs)
                        conn.commit()

                    if completedLcs >= limit:
                        break

                # initialize new LC
                uid, label, times, mags, errors = adapter.initLcFrom(row)

    logger.info("committing progress: %s", completedLcs)
    conn.commit()
    reportTableCount(cursor, table, msg="after loading")
    conn.close()

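# Usage sketch (illustrative): the literal paths, table name, and numbers below
# are assumptions; the required dict keys are exactly the ones read by
# loadFlatLcDataset and connFromParams.
def _exampleLoadFlatLcDataset():
    params = {"relativePath": "data/macho/macho-train.csv",  # flat csv of LC points
              "skiprows": 1,        # header rows to skip
              "dataName": "macho"}  # selects MachoAdapter ("ogle3" and "k2" also supported)
    dbParams = {"dbPath": "data/macho/lcs.db",  # sqlite path relative to project root (assumed)
                "timeout": 60,                  # sqlite connect timeout in seconds
                "commitFrequency": 500}         # commit every N completed LCs
    loadFlatLcDataset(params, dbParams, table="macho_lcs", limit=float("inf"))
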
def getDatasetFilePaths(datasetName: str, ext: str) -> List[str]:
    """Returns the full paths of all dataset files in the project data
    directory: ./light_curve_ml/data/

    :param datasetName: name of specific dataset whose individual file paths
        will be returned
    :param ext: required file extension of dataset files
    """
    path = joinRoot("data", datasetName)
    return [os.path.join(path, f) for f in os.listdir(path) if f.endswith(ext)]

def connFromParams(dbParams: dict) -> Union[Connection, None]:
    """Opens a sqlite3 connection using `dbParams`; returns None if the
    database path cannot be resolved."""
    p = joinRoot(dbParams["dbPath"])
    timeout = dbParams["timeout"]
    conn = None
    try:
        conn = sqlite3.connect(p, timeout=timeout)
    except sqlite3.OperationalError:
        logger.exception("Cannot resolve path: %s", p)

    return conn

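# Usage sketch (illustrative values): `dbPath` is resolved relative to the
# project root and `timeout` is the sqlite connect timeout in seconds; the
# None check matters because connFromParams swallows sqlite3.OperationalError.
def _exampleConnFromParams():
    conn = connFromParams({"dbPath": "data/lcs.db", "timeout": 30})  # assumed values
    if conn is not None:
        try:
            conn.execute("SELECT 1")  # cheap sanity check on the connection
        finally:
            conn.close()
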
def main():
    paths = [
        "data/macho/macho-sample.csv",
        "data/ucr_lcs/StarLightCurves_TEST.csv"
    ]
    for path in paths:
        fullPath = joinRoot(path)
        ext = path.split(".")[-1]
        if ext == "csv":
            obj = loadCsv(fullPath)
        elif ext == "json":
            obj = loadJson(fullPath)
        else:
            print("bad ext: " + ext)
            continue

        dumpWhereFound(obj, fullPath, ext)

def main():
    np.random.seed(0)
    dataPath = joinRoot("data/rf/breast-cancer-wisconsin.csv")
    dataset = pd.read_csv(dataPath)
    headers = list(dataset)
    description = dataset.describe()
    print("Missing features: %s" % missingFeatures(dataset, description))

    # remove missing data
    dataset = dataset[dataset[headers[6]] != "?"]

    # trim away 'CodeNumber' and 'CancerType' columns
    featureHeaders = dataset[headers[1:-1]]
    targetHeaders = dataset[headers[-1]]
    trainRatio = 0.7
    xTrain, xTest, yTrain, yTest = train_test_split(featureHeaders,
                                                    targetHeaders,
                                                    train_size=trainRatio)

    # train and test dataset size details
    print("\nTrain & Test sizes")
    print("Train_x Shape: ", xTrain.shape)
    print("Train_y Shape: ", yTrain.shape)
    print("Test_x Shape: ", xTest.shape)
    print("Test_y Shape: ", yTest.shape)

    model = trainRfClassifier(xTrain, yTrain)
    testPredictions = model.predict(xTest)
    trainPredictions = model.predict(xTrain)

    reportSample = 10
    print("\nSample performance")
    t = PrettyTable(["Predicted", "Actual"])

    # convert the dataframe to a list so the indexes are in order
    testYList = list(yTest)
    for i in range(reportSample):
        t.add_row([testPredictions[i], testYList[i]])

    print(t)

    # accuracy
    print("\nFull performance")
    print("Train accuracy: ", accuracy_score(yTrain, trainPredictions))
    print("Test accuracy: ", accuracy_score(yTest, testPredictions))
    print("Confusion: ", confusion_matrix(yTest, testPredictions))

def main(): """Generates a .csv file containing the labeled MACHO training set. Columns of macho-train.csv output: 0 - macho_uid 1 - classification 2 - date_observed 3 - magnitude 4 - error Additionally generates a second csv file containing the UIDs of missing data files. """ inDir = joinRoot("data/macho/class") redBands = [ ",".join([ "field-tile-seqn-band", "classLabel", "date_observed", "magnitude", "error" ]) + "\n" ] blueBands = [] # N.B. pt1 generated file names of the form: # 'field=1_tile=33_seqn=10_class=6.csv' pattern = r"""\d+""" dataLengths = Counter() # Heading for missing UID file missing = [",".join(("field", "tile", "seqn")) + "\n"] for f in absoluteFilePaths(inDir, ext="csv"): try: data = np.loadtxt(f, skiprows=1, delimiter=",") except ValueError: logger.critical("can't load file: %s", f) continue fileName = f.split("/")[-1].split(".")[0] field, tile, seqn, classNum = re.findall(pattern, fileName) label = MACHO_NUM_TO_LABEL[classNum] prefix = [field, tile, seqn] for r in data: # column format for source file # 0=dateobs, 1=rmag, 2=rerr, 3=bmag, 4=berr # uid, class label, dateobs, rmag, rerr _rVals = [machoUid(prefix + ["R"]), label ] + [str(_) for _ in r[:3]] # uid, class label, dateobs, bmag, berr _bVals = ([machoUid(prefix + ["B"]), label] + [str(r[0])] + [str(_) for _ in r[3:]]) redBands.append(",".join(_rVals) + "\n") blueBands.append(",".join(_bVals) + "\n") dataLengths[len(data) // 10] += 1 # data length histogram in 10s if not len(data): missing.append(",".join((field, tile, seqn)) + "\n") outDir = joinRoot("data/macho") trainFile = os.path.join(outDir, "macho-train.csv") with open(trainFile, "w") as f: f.writelines(redBands) f.writelines(blueBands) missingFile = os.path.join(outDir, "macho-train-fails.csv") with open(missingFile, "w") as f: f.writelines(missing) logger.critical("LC length distribution: %s", sorted(list(dataLengths.items())))
def main():
    outDir = joinRoot("data/macho/raw")
    commandBase = tapCommandBase()
    returnedLimit = 500000
    limit = int(10e7)
    # testQuery = "SELECT TOP 10 * FROM public.star_view"
    joinQuery = ("SELECT TOP %s b.poc, a.fieldid, a.tileid, a.seqn, "
                 "a.obsid, a.dateobs, a.rmag, a.rerr, a.bmag, a.berr "
                 "FROM public.photometry_view AS a "
                 "JOIN public.varstar_view AS b "
                 "ON (a.fieldid=b.field AND a.tileid=b.tile AND a.seqn=b.seqn) "
                 "WHERE a.fieldid=%s AND b.poc='%s'")

    # Due to a limitation of returning at most 500K records at a time, the data
    # is grabbed across a series of queries, one for each observation field and
    # each poc category
    # fields = [1, 2]
    # fields based on data shown at http://macho.nci.org.au/macho_photometry/
    fields = (genList(25, 180) + genList(206, 208) + genList(211, 213) +
              genList(301, 311) + genList(401, 403))
    categoryStart, categoryEnd = 1, 11
    classCounts = defaultdict(int)
    allStart = time.time()
    for field in fields:
        for cat in range(categoryStart, categoryEnd + 1):
            logger.info("Field: %s Class: %s", field, cat)
            outPath = os.path.join(outDir, "c%s_f%s.csv" % (cat, field))
            fullQuery = joinQuery % (limit, field, cat)
            cmd = commandBase + ["adql=" + fullQuery, "out=" + outPath]
            apiStart = time.time()
            try:
                output = subprocess.check_output(cmd)
            except CalledProcessError:
                logger.exception("JAR call failed")
                return

            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("call took: %.01fs", time.time() - apiStart)

            if output:
                logger.debug("subprocess output: %s", output.decode("utf-8"))

            # if outfile is empty, print a warning and delete it
            with open(outPath, "r") as outFile:
                outLineCount = sum(1 for _ in outFile)

            classCounts[cat] += outLineCount
            if outLineCount == 1:
                logger.info("Skipping empty result")
                os.remove(outPath)

            if outLineCount >= returnedLimit:
                logger.warning("Reached TAP limit! Data likely missed: %s",
                               outLineCount)

    t = PrettyTable(["Category", "Counts", "Percentage"])
    totalCounts = sum(classCounts.values())
    for cat, counts in sorted(classCounts.items()):
        t.add_row([cat, counts, round(100.0 * counts / totalCounts, 2)])

    # +----------+---------+------------+
    # | Category | Counts  | Percentage |
    # +----------+---------+------------+
    # | 1        | 2668376 | 32.29      |
    # | 2        | 612715  | 7.41       |
    # | 3        | 111089  | 1.34       |
    # | 4        | 619357  | 7.49       |
    # | 5        | 318188  | 3.85       |
    # | 6        | 55188   | 0.67       |
    # | 7        | 152359  | 1.84       |
    # | 8        | 352080  | 4.26       |
    # | 9        | 187325  | 2.27       |
    # | 10       | 3048492 | 36.89      |
    # | 11       | 138465  | 1.68       |
    # +----------+---------+------------+
    logger.info(t)
    logger.info("Entire harvest took: %.01fm", (time.time() - allStart) / 60)

def tapCommandBase(jreBinaryPath="/usr/bin/java"):
    """Builds the base STILTS `tapquery` command used to query the MACHO TAP
    service."""
    jarPath = joinRoot("jars/stilts.jar")
    commandBase = [jreBinaryPath, "-jar", jarPath, "tapquery"]
    return commandBase + [
        "tapurl=http://machotap.asvo.nci.org.au/ncitap/tap",
        "compress=true"
    ]