Example #1
def _parseFile(filepath):
    #this should only happen when we process the first file ever
    if filepath is None:
        return set(), None, None
    
    data = set()
    
    info = datafiles.read_info_file(filepath)
    if info['date_last_absent'] is None:
        timestamp = util.convert_date_to_millis(info['date_modified']) 
    else:
        timestamp = util.convert_date_to_millis(info['date_first_present'])
        
    csvfile = open(filepath)
    dialect = csv.Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = csv.DictReader(csvfile, dialect=dialect)
    for row in reader:
        secid = database.getSecidFromCsid(row['GVKEY'], row['IID'], timestamp)
        if secid is None:
            secid = database.createNewCsid(row['GVKEY'], row['IID'], timestamp, None, None, True)
            util.warning("Created new secid: {}.{}=>{}".format(row['GVKEY'], row['IID'], secid))
        data.add((secid, int(row["SPLITDATE"]), float(row["SPLITRATE"])))
        
    #get the file start date (YYYYMMDD) from the filename
    startDate = os.path.normpath(filepath).split("/")[-1][0:8] #take the last path token's first 8 chars
    startDate = int(startDate)
            
    return data, startDate, timestamp
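
Every parser in these examples makes the same decision from the .info metadata returned by datafiles.read_info_file: if the file was never observed absent (so there is no reliable acquisition time), fall back to the modified date and flag the data as backfill; otherwise use the first-present date. A minimal sketch of that shared pattern, assuming the metadata dict carries the keys used above; pick_timestamp is an illustrative name, not part of the original codebase:

def pick_timestamp(info, to_millis):
    """Return (timestamp_millis, backfill) from a datafiles .info metadata dict.

    info      -- dict with 'date_last_absent', 'date_first_present', 'date_modified'
    to_millis -- date-to-millis converter such as util.convert_date_to_millis
    """
    if info['date_last_absent'] is None:
        # first fetch ever: no reliable acquisition time, treat the data as backfilled
        return to_millis(info['date_modified']), 1
    # the file was seen appearing: its first-present time is the real born time
    return to_millis(info['date_first_present']), 0
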
Example #2
def __getDeltas(filepath, source, treatFutureData=False):
    ######get the previously processed file##########

    localDir = config.load_source_config(source)["local_dir"]
    previousData = set()
    previousFileName = database.getLastProcessedFile(source)
    if previousFileName is not None:
        previousFileName = os.environ[
            "DATA_DIR"] + "/" + localDir + "/" + previousFileName
        previousFileInfo = datafiles.read_info_file(previousFileName)
        previousFileDate = previousFileInfo["date_first_present"]

        firstLoad = False
        zf = zipfile.ZipFile(previousFileName)
        names = zf.namelist()
        assert len(names) == 1
        file = zf.open(names[0])

        #skip header
        file.readline()
        for line in file:
            if treatFutureData:
                effectiveDate = line.strip().split("|")[4]
                effectiveDate = dateutil.parser.parse(effectiveDate +
                                                      " 00:00:00.000000 UTC")
                if effectiveDate < previousFileDate:
                    previousData.add(line)
            else:
                previousData.add(line)

        file.close()
        zf.close()
    else:
        firstLoad = True

    ##########get deltas from previous file#############
    currentData = set()
    zf = zipfile.ZipFile(filepath)
    names = zf.namelist()
    assert len(names) == 1
    file = zf.open(names[0])

    #skip header
    file.readline()
    for line in file:
        currentData.add(line)

    file.close()
    zf.close()

    newData = currentData - previousData
    removedData = previousData - currentData

    return (newData, removedData, firstLoad)
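
The delta step at the end is plain set arithmetic on the raw lines: whatever is in the current file but not the previous one is new, and the reverse difference is what was removed. A self-contained illustration of just that step, with made-up pipe-delimited lines:

previous_lines = {"AAA|100|2.5\n", "BBB|200|1.0\n"}
current_lines = {"AAA|100|2.5\n", "CCC|300|4.2\n"}

new_lines = current_lines - previous_lines      # {"CCC|300|4.2\n"}
removed_lines = previous_lines - current_lines  # {"BBB|200|1.0\n"}
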
Example #3
def process(filepath, source):
    info = datafiles.read_info_file(filepath)

    if info["date_last_absent"] is not None:
        backfill = 0
        timestamp = util.convert_date_to_millis(info["date_first_present"])
    else:
        backfill = 1
        timestamp = util.convert_date_to_millis(info["date_modified"])

    database.setAttributeAutoCreate(True)

    bad = 0
    data = util.csvdict(open(filepath))
    for row in data:
        ticker = row["Symbol"]
        secid = database.getSecidFromXref("TIC", ticker, timestamp,
                                          "compustat_idhist",
                                          newdb.xrefsolve.preferUS)
        if secid is None:
            continue

        try:
            date = util.convert_date_to_millis(row["Record_Date"])
        except Exception:
            util.warning("Bad date for row: " + str(row))
            bad += 1
            if bad > 20:
                util.error(
                    str(bad) +
                    " bad lines found. Raising exception. Go check file " +
                    filepath)
                raise Exception(
                    str(bad) +
                    " bad lines found. Raising exception. Go check file " +
                    filepath)
            continue

        for sqAtt, ourAtt in attributeMap.iteritems():
            name = ourAtt[0]
            compareWithRecent = ourAtt[1]
            value = row[sqAtt]
            if value == '': value = None
            database.insertAttribute("sec", "n", secid, date, source, name,
                                     value, timestamp, None, backfill, False,
                                     compareWithRecent, approximatelyEqual)
Example #4
def process(filepath, source):
    date = os.path.basename(filepath).split('.')[2]
    born = date + " 09:30 EST"
    date_millis = util.convert_date_to_millis(date)
    born_millis = util.convert_date_to_millis(born)

    # If we have acquisition times, use these for the real born_millis time
    info = datafiles.read_info_file(filepath)
    if info['date_last_absent'] is not None:
        born_millis = util.convert_date_to_millis(info['date_first_present'])
        backfill = 0
    else:
        born_millis = util.convert_date_to_millis(born)
        backfill = 1

    database.setAttributeAutoCreate(True)

    for line in file(filepath):
        handle_htb(line, date_millis, born_millis, backfill)
Example #5
def process(filepath, source):
    sourceNameInDatabase = "onlineinvestor"
    info = datafiles.read_info_file(filepath)

    if "hist" in source:
        backfill = 1
        #timestamp will be data dependent
    else:
        backfill = 0
        timestamp = util.convert_date_to_millis(info["date_modified"])

    database.setAttributeAutoCreate(True)

    with open(filepath, "r") as file:
        for line in file:
            tokens = line.split("\t")
            date = util.convert_date_to_millis(tokens[0])
            ticker = tokens[1]
            notes = tokens[2]
            if backfill == 1:
                born = date
            else:
                born = timestamp

            secid = database.getSecidFromXref("TIC", ticker, date, "compustat",
                                              newdb.xrefsolve.preferUS)
            if secid is None:
                util.warning("Failed to map ticker {},{}".format(
                    ticker, tokens[0]))
                continue

            coid, issueid = database.getCsidFromSecid(secid)
            assert coid is not None

            database.insertAttribute("co", "s", coid, date,
                                     sourceNameInDatabase, "BUYBACK", notes,
                                     born, None, backfill)
Example #6
            # Load set of seen files
            util.info("Fetching processed files for %s" % source)
            seen = database.getProcessedFiles(source)

            util.info("Intersecting...")
            for row in listing:
                util.debug("Looking at info: %s" % row[0])
                file_path_info = row[0]
                file_path = os.path.normpath(file_path_info[0:-5])
                #file_path_rel = file_path.replace("%s/%s/" % (os.environ["DATA_DIR"], sconfig['local_dir']), "")
                file_path_rel = os.path.relpath(
                    file_path, "/".join(
                        (os.environ["DATA_DIR"], sconfig["local_dir"])))
                if file_path_rel not in seen:
                    info = datafiles.read_info_file(file_path)
                    # If we don't have reliable acquisition times (first fetch), use modified timestamp
                    if info['date_last_absent'] is None:
                        date_released = info['date_modified']
                    else:
                        date_released = info['date_first_present']

                    #if we are processing using lag, do not add file
                    if options.lag is not None and (
                            util.now() - util.convert_date_to_millis(
                                datetime.timedelta(days=options.lag)) <
                            util.convert_date_to_millis(date_released)):
                        continue

                    util.info("Found new file:< %s" % file_path)
                    files.append({
Example #7
        if len(subdirs) == 0:
            errors.append("{}: Never received a file".format(
                sourceConfigFile[:-3]))
            continue

        subdir = subdirs[-1]
        acquireTimestamp = 0L
        for node in os.walk(sourceLocalDir + "/" + subdir):
            dir = node[0]
            files = node[2]
            for file in files:
                if ".info" in file or ".time" in file or ".new" in file:
                    continue

                info = datafiles.read_info_file(dir + "/" + file)
                timestamp = util.convert_date_to_millis(
                    info["date_first_present"])
                if timestamp > acquireTimestamp:
                    acquireTimestamp = timestamp

        now = util.now()
        checkTimestamp = util.convert_date_to_millis(
            cPickle.load(open(timeFile, 'rb')))
        #get the frequency with which we expect new data
        expectedNewDataFrequency = sc.get("new_data_frequency",
                                          defaultNewDataFrequency)

        checkHours = (now - checkTimestamp) / (60 * 60 * 1000)
        checkMins = ((now - checkTimestamp) % (60 * 60 * 1000)) / (60 * 1000)
        acquireHours = (now - acquireTimestamp) / (60 * 60 * 1000)
Example #8
def process(filePath, source, verifyOnly=False):
    #process the RSK files for now
    if filePath.find(".RSK.") < 0:
        return
    file = open(filePath, "r")

    #The first 2 lines should be the pricedate and the modeldate
    tokens = file.readline().strip().split(":")
    if tokens[0] != "PriceDate":
        util.error("It doesn't seem like a barra daily format")
        raise Exception
    else:
        priceDate = __barraDateToCompact(tokens[1].strip())

    tokens = file.readline().strip().split(":")
    if tokens[0] != "ModelDate":
        util.error("It doesn't seem like a barra daily format")
        raise Exception
    else:
        #pass
        modelDate = __barraDateToCompact(tokens[1].strip())

    # If we have acquisition times, use these for real born time.
    # Else, use the priceDate + 1 day
    fileInfo = datafiles.read_info_file(filePath)
    if fileInfo['date_last_absent'] is not None:
        timestamp = util.convert_date_to_millis(fileInfo['date_first_present'])
        backfill = 0
        database.setAttributeAutoCreate(True)
    else:
        date = priceDate + datetime.timedelta(days=1)
        timestamp = util.convert_date_to_millis(date.strftime("%Y%m%d"))
        backfill = 1
        database.setAttributeAutoCreate(True)

    #get the header names. comma separated, surrounded by double quotes
    line = file.readline()
    headers = __getListFromBarraLine(line)

    #init the database
    #database.dropXrefCache()
    #database.addXrefCache(timestamp) #cache xrefs

    #######MAPPING VERIFICATION CODE########
    inconcistentMappings = []
    ########################################

    for line in file:
        data = __getListFromBarraLine(line)

        if len(data) != len(headers):
            util.warning("Skipping bad line: {}".format(line))
            continue

        data = dict(zip(headers, data))

        #######MAPPING VERIFICATION CODE########
        if verifyOnly:
            result = __verifyMapping(
                data["BARRID"], util.cusip8to9(data["CUSIP"]), data["TICKER"],
                source, timestamp,
                newdb.xrefsolve.preferUS)  #mirror the getSecid call
            if result is not None: inconcistentMappings.append(result)
            continue
        ########################################

        secid = __getSecId(data["BARRID"], util.cusip8to9(data["CUSIP"]),
                           data["TICKER"], source, timestamp,
                           newdb.xrefsolve.preferUS, filePath)
        if secid is None:
            continue

        #Now, insert barra attributes and attribute values
        __removeUnwantedAttributes(data)
        for attributeName, attributeValue in data.iteritems():
            if isinstance(attributeValue, str):
                table = "s"
            elif isinstance(attributeValue, int):
                table = "n"
            elif isinstance(attributeValue, float):
                table = "n"
            else:
                util.error(
                    "Attribute values should be either int, float or str")
                raise Exception(
                    "Attribute values should be either int, float or str")

            #assert attributeName.startswith("INDNAME") and table=="s"

            #With the exception of capitalization and price, the other barra attributes
            #are evaluated monthly, so for them the date should be the model date.
            #Price we ignore; for capitalization we only create a new tuple if it
            #has changed by more than a threshold since the last date for which
            #we have a tuple.
            if attributeName == "PRICE":
                continue
            elif attributeName == "CAPITALIZATION":
                database.insertAttribute(
                    "sec", "n", secid, util.convert_date_to_millis(priceDate),
                    source, attributeName, attributeValue, timestamp, None,
                    backfill, False, True, __capEquals)
            else:
                database.insertAttribute(
                    "sec", table, secid,
                    util.convert_date_to_millis(modelDate), source,
                    attributeName, attributeValue, timestamp, None, backfill)

    file.close()

    #######MAPPING VERIFICATION CODE########
    if verifyOnly:
        return inconcistentMappings
Example #9
def process(filepath, source):
    #if full
    if "full" in source:
        #timestamp=util.convert_date_to_millis("18000101");
        fileInfo = datafiles.read_info_file(filepath)
        timestamp = util.convert_date_to_millis(fileInfo['date_first_present'])
        backfill = 1
        database.setAttributeAutoCreate(True)
        optimize = False
    else:
        fileInfo = datafiles.read_info_file(filepath)
        timestamp = util.convert_date_to_millis(fileInfo['date_first_present'])
        backfill = 0
        database.setAttributeAutoCreate(False)
        optimize = True

    database.setAttributeAutoCreate(True)
    database.setCurrencyAutoCreate(True)

    #open the zipped file
    zf = zipfile.ZipFile(filepath)
    names = zf.namelist()
    assert len(names) == 1
    file = zf.open(names[0])

    #variables that persist through loop
    #presented here for clarity only
    table = None
    keyNames = None
    attributeNames = None
    numOfKeys = None

    if optimize:
        parsedLines = __optimize(file)

    #filter secids and coids to be processed
    if "_g" in source or "global" in source:
        __getGlobalCoids()
        processSecid = __globalSecidFilter
        processCoid = __globalCompanyFilter
    else:
        processSecid = __localSecidFilter
        processCoid = __localCompanyFilter

    #process lines
    counter = 0
    while True:
        if optimize:
            if len(parsedLines) == 0: break
            line = parsedLines.pop(0)

            if len(line) == 3:
                (command, keyValues,
                 attributeValues) = line[0], line[1], line[2]
            elif len(line) == 4:
                (table, numOfKeys, keyNames,
                 attributeNames) = line[0], line[1], line[2], line[3]
                continue
            else:
                continue
        else:
            line = __getSplitLine(file)
            if line is None: break

            if line[0] in ("T", "F", "E"):
                continue
            elif line[0] in ("H"):
                (table, numOfKeys, keyNames,
                 attributeNames) = __parseHeaderLine(line)
                continue
            elif line[0] in ("I,C,D,R"):
                (command, keyValues,
                 attributeValues) = __parseDataLine(line, numOfKeys)
            else:
                util.error("Oh no! a K command: {}".format(line))
                continue

        #progress
        counter = counter + 1
        if counter % 10000 == 0:
            util.info("{}: Processing line {}k".format(datetime.datetime.now(),
                                                       counter / 1000))

        #remove keys that are replicated in attributes
        keys = {}
        keys.update(zip(keyNames, keyValues))
        attributes = {}

        if command in ("I", "C"):
            for n, v in zip(attributeNames, attributeValues):
                if n not in keys and v != "": attributes[n] = v
        elif command in ("D"):
            for n, v in zip(attributeNames, attributeValues):
                if n not in keys and v == " ": attributes[n] = None
        elif command in ("R"):
            for n, v in zip(attributeNames, attributeValues):
                if n not in keys: attributes[n] = None

        if table == "security":
            __processSecurity(command, keys, attributes, timestamp, source,
                              backfill, processCoid, processSecid)
        elif table == "sec_dprc":
            __processPrice(command, keys, attributes, timestamp, source,
                           backfill, processCoid, processSecid)
            __processCSHOC(command, keys, attributes, timestamp, source,
                           backfill, processCoid, processSecid)
        elif table == "company":
            __processCompany(command, keys, attributes, timestamp, source,
                             backfill, processCoid, processSecid)
        elif table == "sec_divid":
            __processDividend(command, keys, attributes, timestamp, source,
                              backfill, processCoid, processSecid)
        elif table == "sec_split":
            __processSplit(command, keys, attributes, timestamp, source,
                           backfill, processCoid, processSecid)
        elif table == "co_industry":
            __processIndustry(command, keys, attributes, timestamp, source,
                              backfill, processCoid, processSecid)
        elif table == "co_hgic":
            __processHgic(command, keys, attributes, timestamp, source,
                          backfill, processCoid, processSecid)
        elif table in ("co_afnd1", "co_afnd2", "co_ifndq", "co_ifndsa",
                       "co_ifndytd"):
            __processFundamental(command, keys, attributes, timestamp, source,
                                 backfill, processCoid, processSecid)
        elif table in ("co_idesind", 'co_adesind'):
            __processDesind(command, keys, attributes, timestamp, source,
                            backfill, processCoid, processSecid)
        elif table in ("co_amkt", 'co_imkt'):
            __processMkt(command, keys, attributes, timestamp, source,
                         backfill, processCoid, processSecid)
        elif table == "co_filedate":
            __processFiledate(command, keys, attributes, timestamp, source,
                              backfill, processCoid, processSecid)
        elif table == "adsprate":
            __processCredit(command, keys, attributes, timestamp, source,
                            backfill, processCoid, processSecid)
        elif table == "exrt_dly":
            __processExchange(command, keys, attributes, timestamp, source,
                              backfill)
        else:
            continue

    #__processBufferedFundamentals(source, backfill, buffer)
    file.close()
    zf.close()
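
The main loop above consumes a line-oriented feed in which the first token is a record code: "H" introduces a table header, "I"/"C"/"D"/"R" carry data rows, and "T"/"F"/"E" are skipped. A stripped-down sketch of that dispatch, with invented line contents and without the database calls, to make the control flow explicit:

def dispatch(lines):
    table = None
    for tokens in lines:                    # each item is an already-split line
        code = tokens[0]
        if code in ("T", "F", "E"):         # trailer/filler records: skip
            continue
        elif code == "H":                   # header record: remember the current table
            table = tokens[1]
        elif code in ("I", "C", "D", "R"):  # insert/change/delete/remove rows
            yield table, code, tokens[1:]

# illustrative input only, not real Compustat content
sample = [["H", "security"], ["I", "001234", "01"], ["D", "001234", "01"]]
print(list(dispatch(sample)))               # [('security', 'I', [...]), ('security', 'D', [...])]
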
Example #10
def _parseFile(filepath):
    #this should only happen when we process the first file ever
    if filepath is None:
        return {}, None, None, None
    
    info = datafiles.read_info_file(filepath)
    if os.path.basename(filepath).startswith("yearn_archive.txt"):
        backfill = 1
        archive = True
    elif info['date_last_absent'] is None:
        timestamp = util.convert_date_to_millis(info['date_modified'])
        backfill = 1
        archive = False
    else:
        timestamp = util.convert_date_to_millis(info['date_first_present'])
        backfill = 0
        archive = False
    
    file = open(filepath, "r")
    data={}
    
    for line in file:
        line = line.rstrip("\n")
        
        # Parse date
        # XXX all dates need to be in UTC based on exchange of stock
        annDate, name, ticker, value, time = line.split("\t")
        if time == 'Time Not Supplied':
            exactAnnDate = annDate + ' 00:00 UTC'
        elif time == 'Before Market Open':
            exactAnnDate = annDate + ' 08:00 EST'
        elif time == 'After Market Close':
            exactAnnDate = annDate + ' 17:00 EST'
        else:
            exactAnnDate = annDate +" "+ time.replace("ET", "EST")
        
        #annDate to millis
        try:
            exactAnnDate = util.convert_date_to_millis(exactAnnDate)
        except:
            util.warning("Failed to parse {}".format(exactAnnDate))
            print "Failed to parse {}".format(exactAnnDate)
            continue
        if archive:
            timestamp = util.convert_date_to_millis(annDate) - util.convert_date_to_millis(datetime.timedelta(days=30))
        
        secid = database.getSecidFromXref("TIC", ticker, timestamp, "compustat_idhist", newdb.xrefsolve.preferUS)
        if secid is None:
            util.warning("Failed to map ticker {}".format(ticker))
            continue

        coid, issueid = database.getCsidFromSecid(secid)
        assert coid is not None
    
        data[(coid, exactAnnDate, backfill)] = annDate
        #data.append((coid,exactAnnDate,backfill,timestamp))
    
    file.close()
        
    #get the file start date from the filename
    if not archive:
        startDate=os.path.normpath(filepath).split("/")[-1][0:8] #split the filepath, take last token and its first 8 chars
    else:
        startDate="20060101"
            
    return (data,archive,startDate,timestamp)
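
The time handling above maps the feed's fuzzy announcement times to concrete datetime strings before converting them to millis. A small standalone version of just that mapping (the nominal 08:00/17:00 times come from the example itself; the helper name is illustrative):

def exact_announce_string(annDate, time):
    """Build the datetime string that later goes through util.convert_date_to_millis."""
    if time == 'Time Not Supplied':
        return annDate + ' 00:00 UTC'
    elif time == 'Before Market Open':
        return annDate + ' 08:00 EST'
    elif time == 'After Market Close':
        return annDate + ' 17:00 EST'
    else:
        # the feed gave a concrete Eastern time, e.g. '4:30 pm ET'
        return annDate + ' ' + time.replace('ET', 'EST')

print(exact_announce_string('20110421', 'After Market Close'))  # 20110421 17:00 EST
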
Example #11
def __processPush(filepath, source):
    #get the date the data are about
    date = util.convert_date_to_millis(filepath[-21:-13])

    fileInfo = datafiles.read_info_file(filepath)
    if fileInfo["date_last_absent"] is None:
        backfill = 1
        timestamp = date
    else:
        backfill = 0
        timestamp = util.convert_date_to_millis(fileInfo["date_first_present"])

    file = open(filepath, "r")

    #make a first pass and collect the data
    data = []
    counter = Counter()
    for line in file:
        tokens = line.strip().split(",")
        type = tokens[4]

        if type != 'R':
            util.error("Strange line in availability push file " + line)
            continue

        cusip = tokens[1]
        quantity = float(tokens[2])
        rate = float(tokens[3])

        data.append((cusip, quantity, rate))
        counter[rate] += 1

    #get the mode (most frequent) of the rates
    rateModes = counter.most_common(2)
    #assert that the most frequent rate is the larger of the two, i.e., the mode corresponds to the "base" borrow rate
    assert rateModes[0][0] > rateModes[1][0]
    rateMode = rateModes[0][0]

    #insert the data
    failure = 0
    for datum in data:
        cusip = datum[0]
        quantity = datum[1]
        rateDiff = datum[2] - rateMode

        secid = database.getSecidFromXref("CUSIP", cusip, timestamp,
                                          "compustat_idhist",
                                          newdb.xrefsolve.preferUS)
        if secid is None:
            failure += 1
            util.warning("Failed to map CUSIP {}. Failure #{}".format(
                cusip, failure))
            continue

        if rateDiff > 0:
            util.error(
                "Positive rate for {}: Rate={}, Mode={}, Diff={}".format(
                    cusip, datum[2], rateMode, rateDiff))
        elif rateDiff == 0:
            pass
        else:
            database.insertAttribute("sec", "n", secid, date, source,
                                     "BORROW_RATE_PUSHED", rateDiff, timestamp,
                                     None, backfill, False, False,
                                     util.dict_fields_eq_num_stable)

        database.insertAttribute("sec", "n", secid, date, source,
                                 "BORROW_AVAILABILITY", quantity, timestamp,
                                 None, backfill, False, False,
                                 util.dict_fields_eq_num_stable)

    file.close()
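
Both this example and the next normalize each security's rate against the most common ("base") rate in the file, found via collections.Counter, and only store the (negative) difference from that base. A compact, self-contained sketch of that step with made-up rates:

from collections import Counter

rates = [-0.25, -0.25, -0.25, -5.0, -0.25, -1.5]   # illustrative borrow rates only
counter = Counter(rates)

(baseRate, _), (secondRate, _) = counter.most_common(2)
assert baseRate > secondRate       # the mode should be the least punitive ("base") rate

diffs = [r - baseRate for r in rates]
print(diffs)                       # 0.0 for base-rate names, negative for specials
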
Example #12
def __processRequest(filepath, source):
    #get the date the data are about
    date = util.convert_date_to_millis(filepath[-21:-13])

    fileInfo = datafiles.read_info_file(filepath)
    if fileInfo["date_last_absent"] is None:
        backfill = 1
        timestamp = date
    else:
        backfill = 0
        timestamp = util.convert_date_to_millis(fileInfo["date_first_present"])

    file = open(filepath, "r")

    #make a first pass and collect the data
    data = []
    counter = Counter()
    for line in file:
        tokens = line.strip().split(",")
        ticker = tokens[0]
        requested = float(tokens[2])
        allocated = float(tokens[3])
        #notes=tokens[4]
        if len(tokens) > 5:
            rate = float(tokens[5])
            type = tokens[6]
        else:
            rate = None
            type = None

        assert type is None or type == "R"
        data.append((ticker, requested, allocated, rate))
        counter[rate] += 1

    #get the mode (most frequent) of the rates
    rateModes = counter.most_common(2)
    #assert that the most frequent rate is the larger of the two, i.e., the mode corresponds to the "base" borrow rate
    assert rateModes[0][0] > rateModes[1][0]
    rateMode = rateModes[0][0]

    #insert the data
    failure = 0
    for datum in data:
        ticker = datum[0]
        requested = datum[1]
        allocated = datum[2]
        rateDiff = datum[3] - rateMode if datum[3] is not None else None

        secid = database.getSecidFromXref("TIC", ticker, timestamp,
                                          "compustat_idhist",
                                          newdb.xrefsolve.preferUS)
        if secid is None:
            failure += 1
            util.warning("Failed to map TICKER {}. Failure #{}".format(
                ticker, failure))
            continue

        if rateDiff > 0:
            util.error(
                "Positive rate for {}: Rate={}, Mode={}, Diff={}".format(
                    ticker, datum[2], rateMode, rateDiff))
        elif rateDiff == 0:
            pass
        else:
            database.insertAttribute("sec", "n", secid, date, source,
                                     "BORROW_RATE", rateDiff, timestamp, None,
                                     backfill, False, False,
                                     util.dict_fields_eq_num_stable)

        database.insertAttribute("sec", "n", secid, date, source,
                                 "BORROW_REQUESTED", requested, timestamp,
                                 None, backfill, False, False,
                                 util.dict_fields_eq_num_stable)
        database.insertAttribute("sec", "n", secid, date, source,
                                 "BORROW_ALLOCATED", allocated, timestamp,
                                 None, backfill, False, False,
                                 util.dict_fields_eq_num_stable)

    file.close()
Example #13
def process(filePath, source, verifyOnly=False):
    #process the RSK files for now
    if filePath.find(".RSK.") < 0:
        return

    file = open(filePath, "r")

    #The first 2 lines should be the pricedate and the modeldate for daily files
    #For the monthly files it is just the model date

    #check if it is a daily file or a monthly file. Check if the first line contains PriceDate
    firstLine = file.readline()
    if "PriceDate" in firstLine:
        daily = True
        file.seek(0)  #get to the first line again

        tokens = file.readline().strip().split(":")
        if tokens[0] != "PriceDate":
            util.error("It doesn't seem like a barra daily format")
            raise Exception
        else:
            priceDate = __barraDateToCompact(tokens[1].strip())

        tokens = file.readline().strip().split(":")
        if tokens[0] != "ModelDate":
            util.error("It doesn't seem like a barra daily format")
            raise Exception
        else:
            modelDate = __barraDateToCompact(tokens[1].strip())
    else:
        daily = False
        file.seek(0)  #get to the first line again

        token = file.readline().strip()
        priceDate = __barraDateToCompact(token)
        modelDate = __barraDateToCompact(token)

    # If we have acquisition times, use these for real born time.
    # Else, use the priceDate + 1 day
    fileInfo = datafiles.read_info_file(filePath)
    if fileInfo['date_last_absent'] is not None:
        timestamp = util.convert_date_to_millis(fileInfo['date_first_present'])
        backfill = 0
    else:
        if daily:
            date = priceDate + datetime.timedelta(days=1)
        else:
            date = priceDate + datetime.timedelta(days=2)
        timestamp = util.convert_date_to_millis(date.strftime("%Y%m%d"))
        backfill = 1

    database.setAttributeAutoCreate(True)

    priceDate = util.convert_date_to_millis(priceDate)
    modelDate = util.convert_date_to_millis(modelDate)

    #get the header names. comma separated, surrounded by double quotes
    line = file.readline()
    headers = __getListFromBarraLine(line)

    for line in file:
        data = __getListFromBarraLine(line)

        if len(data) != len(headers):
            util.warning("Skipping bad line: {}".format(line))
            continue

        data = dict(zip(headers, data))

        barraid = data["BARRID"]
        cusip = util.cusip8to9(data["CUSIP"])
        #updateBarraRef(barraid, cusip, timestamp, False)
        updateBarraRef(source, barraid, cusip, priceDate, True)

        #Now, insert barra attributes and attribute values
        __removeUnwantedAttributes(data)
        for attributeName, attributeValue in data.iteritems():
            if isinstance(attributeValue, str):
                table = "s"
            elif isinstance(attributeValue, int):
                table = "n"
            elif isinstance(attributeValue, float):
                table = "n"
            else:
                util.error(
                    "Attribute values should be either int, float or str")
                raise Exception(
                    "Attribute values should be either int, float or str")

            #With the exception of capitalization and price, the other barra attributes
            #are evaluated monthly, so for them the date should be the model date.
            #Price we ignore; for capitalization we only create a new tuple if it
            #has changed by more than a threshold since the last date for which
            #we have a tuple.
            if attributeName == "PRICE":
                continue
            elif attributeName == "CAPITALIZATION":
                insertBarraAttribute("n", barraid, priceDate, source,
                                     attributeName, attributeValue, timestamp,
                                     backfill, True, __capEquals)
            elif attributeName in ("TICKER", "CUSIP", "NAME"):
                #protect against crappy names:
                if attributeName == "NAME":
                    attributeValue = __printableString(attributeValue)
                insertBarraAttribute("s", barraid, priceDate, source,
                                     attributeName, attributeValue, timestamp,
                                     backfill, True)
            else:
                insertBarraAttribute(table, barraid, modelDate, source,
                                     attributeName, attributeValue, timestamp,
                                     backfill)

    file.close()
Example #14
def process(filepath, source):
    info = datafiles.read_info_file(filepath)
    born_millis = util.convert_date_to_millis(info['date_first_present'])
    #db.insert_checks(next=True, prev=True)

    database.setAttributeAutoCreate(True)

    f = file(filepath, 'r')
    for line in f.readlines():
        line = line.rstrip("\n")

        # Parse story
        story = line.split("|")
        #secs = int(story[0][1:])
        #num = int(story[1])
        time = story[2][0:9]
        text = story[2][10:]

        local_date = info['date_first_present'].astimezone(
            pytz.timezone('US/Eastern'))
        date = dateutil.parser.parse(str(local_date)[0:11] + time)
        date_millis = util.convert_date_to_millis(date)

        sep = text.find(" - ")
        if sep == -1:
            sep = len(text)
        headline = text[0:sep]
        #body = text[sep+3:]

        category = story[3]
        tickers = story[4].split(";")

        # clean some crap out
        headline = headline.replace("'", "")
        headline = headline.replace("\"", "")
        headline = headline.replace("  ", " ")

        if category == 'Rec-Upgrade' or category == 'Rec-Downgrade':
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            if category == 'Rec-Upgrade':
                value = 1
            else:
                value = -1
            handle_news(tickers[0], 'FRATING', date_millis, value, born_millis)
            if len(tickers) > 1 and re.match(".+ (not|Not|NOT) .+",
                                             headline) != None:
                handle_news(tickers[1], 'FRATING', date_millis, -1 * value,
                            born_millis)

        elif category == "Rec-Initiate":
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            value = None
            if re.match(
                    ".+ (Above Average|Accumulate|Outperform|Buy|Overweight).*",
                    headline):
                value = 1
            elif re.match(".+ (Below Average|Underperform|Sell|Underweight).*",
                          headline):
                value = -1
            elif re.match(".+ (In Line|Perform|Neutral|Hold|Equal Weight).*",
                          headline):
                value = 0
            if value is not None:
                handle_news(tickers[0], 'FRATING', date_millis, value,
                            born_millis)
            else:
                util.warning('unmatched rec initiate')
                util.warning(headline + " " + str(tickers))

        elif category == 'Rumors':
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY1', date_millis, i, born_millis)
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            value = None
            if re.match(
                    ".+ (climbs|rallies|jumps|moving higher|raises on|shares rise|movers higher|moves higher|moves off lows|moves up|shares active|ticks up|ticks higher|strength attributed to|up on|trades higher|trades up|spikes higher|moves to positive territory|spikes|begins to move higher|lifts|continues to rise|moves positive).*",
                    headline):
                value = 1
            elif re.match(".+ (weakness attributed to|moves lower|drops on).*",
                          headline):
                value = -1
            if value is not None:
                handle_news(tickers[0], 'FRUMOR', date_millis, value,
                            born_millis)
            else:
                util.warning('unmatched rumor')
                util.warning(headline + " " + str(tickers))

        elif category == 'Hot Stocks':
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY1', date_millis, i, born_millis)
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            value = None
            if re.match(
                    ".+ (recieve[sd]?|receive[sd]?|issued) .*?(SEC|warning|subpoena|deficiency|delisting|non-?compliance).*",
                    headline):
                value = -1
            elif re.match(
                    ".+ (achieves?|granted|awarded|secures?|renews?|receive[sd]?|granted|expands?|wins?|recieve[sd]?|issues?|issued|presents?|obtains?|announces?|signs?|acquires?|enters?|initiates?|completes?) .*?(rights?|discovery|discovers|awarded|partnerships?|collaborations?|enrollment|agreements?|strategic partner|alliances?|expanded|license|proposals?|permits?|trials?|authorization|availability|certifications?|favorable|data|CE mark|investments?|payments?|extensions?|milestones?|allowances?|accreditations?|(new.*? business)|(oil|gas) reserves|grants?|FDA (priority|approval)|proceeds|royalty|royalties|SPA|([Cc]learance)|waiver|commitments|positive|patents?|contracts?|projects?|deal|orders?|(in (.+)? case)|design|progress|program|assignment|option|approval|settlement|permission|promising|significantly improved|launch|regains|unsolicited offer).*",
                    headline):
                value = 1
            elif re.match(
                    ".+ (to raise|raises|increases|to increase|initiates|raising|declares?|delcares?) .*dividend.*",
                    headline):
                value = 1
            elif re.match(
                    ".+ (cuts|to cut|to lower|lowers|decreases|suspends|plans to suspend|lowering) .*dividend.*",
                    headline):
                value = -1
            elif re.match(".+ (acquires|raises|acquired) .*stake.*", headline):
                value = 1
            elif re.match(
                    ".+ (lowers|liquidn?ates|sell|sells|considering selling|sold) .*stake.*",
                    headline):
                value = -1
            elif re.match(
                    ".+ (recall(s?|ing|ed)|discontinu(ing|ed|es?)|lays off|questions efficacy|announces salary reductions|announces (possible )?compromise|lowers guidance|conditions to worsen|sees (.+)?revenue decline|to layoff|not confident|capacity down|(plummets?|sinks?|drops?|moves? lower|falls?|tumbles?|retreats?) (.+)?(after|following|on)|(to reduce|reduced) (distribution|workforce)|reductions (have been|will be) (implemented|likely)|enters into lease termination|loses to|sales (down|trends? worsens?|decreased)|(credit|ratings?) (may get )?(downgraded|lowered)|downgrades|to (cut|eliminate) ((approximately|roughly) )?(%s )?jobs|to stop offering|pullback in demand|curtails production|not in compliance|takes action against|injunction restrains|(Nasdaq|NASDAQ) (notice|notification)|(notice|notification) from (Nasdaq|NASDAQ)|losses|damaged|misses|lawsuit|fraud|halts).*"
                    % number, headline):
                value = -1
            elif re.match(
                    "(.+)?(launches new|expects increased demand|raises %s|resum(ed|es?|ing)|licenses? out|licenses technology|delivers|begins delivery|settles? (.+)?litigation|increases (.+)?distribution|raises guidance|approached by potential buyers|removed from CreditWatch|sales up|sales trends (.+)?improve|successfully|could expand|rules in favor for|expects .+ orders|confident|closer to Phase|remains on track|on track to|to manufacture|expects (.+)?to improve|expects strong cash flow|expects production to increase|reports? positive|reports? preliminary data|receives offer|expenses to decline|says .+ now available in|expands? distribution|selected by|selected for|sales increased|will improve|positioned for (.+)?recovery|performance strong|(credit|ratings?) (increased|raised|upgraded)|prepared to weather|continues to increase output|expanding capacity|order (.+)?delivered|(rises?|raises?|gains?|spikes?|advances?|rallies?|soars?|surges?|climbs?|trades? higher) (.+)?(on|following|after)|deploys|to deploy|provides|to provide|extend development|FDA approves|to recognize %s gain|buys %s shares|invests in second phase|shares rise|reaches agreement|sees growth|adds significant production).*"
                    % (number, number, number), headline):
                value = 1
            if value is not None:
                handle_news(tickers[0], 'FHOT', date_millis, value,
                            born_millis)
            else:
                util.warning('unmatched hot stocks')
                util.warning(headline + " " + str(tickers))

        elif category == "Recommendations":
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY1', date_millis, i, born_millis)
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            if headline.find("price target to") != -1:
                m = re.match(
                    ".+ price target to (?P<target>%s) from (?P<oldtgt>%s) at .+"
                    % (number, number), headline)
                if m is not None:
                    gd = m.groupdict()
                    target = normalize_number(gd['target'])
                    oldtgt = normalize_number(gd['oldtgt'])
                    if target > oldtgt:
                        value = 1
                    elif target < oldtgt:
                        value = -1
                    else:
                        value = 0
                    handle_news(tickers[0], 'FREC', date_millis, value,
                                born_millis)
            else:
                value = None
                if re.match(
                        ".+ (raises price target|is a good deal|underappreciated|likely (to )?be approved|should (grow|move higher)|momentum is continuing|weakness is an overreaction|move is positive|reported (solid|excellent)|pursuing correct|outlook remains positive|will be helped|[cs]hould be better than expected|valuation compelling|has been (very )?positive|should stay strong|are top ideas|checks indicate healthy|should benefit|recommends a long|fundamentals still solid|well-positioned to (outperform|benefit)|shares oversold|should be bought|creates (a )?buying opportunity|a (strong )?buying opportunity|highly attractive|should sell better|problem is fixable|down on misguided|sell-off is overdone|positive news|can achieve|(is|are) strong|outlook (is )?boosted|guidance (is )?(likely )?conservative|should gain|reiterated (Outperform|Buy)|should be owned|poised|be able to overcome|has (good|best) prospects|significantly undervalued|added to Top Picks|remains? undervalued|results bode well|upgraded|valuation (is )?(still )?(remains )?attractive|attractively valued|raise is likely|added to (short[- ]term )?buy list|added to .+ List|added to .+ as a buy|shares defended at|should report (strong|better|stronger|solid)|margins strong|continue to generate (strong )?growth|(shown|shows) signs of improvement|estimates raised|strategy worked|results will likely be solid|named a long|weakness a buying opportunity|risk/reward (ratio )?(is )?(attractive|positive|favorable)|upgraded|mentioned positively|target raised|supports approval|has an approvable|still approvable).*",
                        headline):
                    value = 1
                elif re.match(
                        ".+ (target cut|reiterated Sell|should report weak(er)?|shares likely to be weak|growth seems to be slowing|estimates (reduced|trimmed)|fundamentals do not support|(will|appears to) be hurt|should be sold|valuations? (is )?(still )?(remains )?unattractive|(should|will) encounter more competition|expectations could be aggressive|remains overvalued|indicate slowing|likely to lose|faces risk|should report (.+)?weaker|will face (.+)?slowdown|sales ((appear to be|are) )?deteriorating|downgraded|estimates lowered|removed from .+ List|removed from Top Picks|still likely to fail|likely to stimulate fear|target lowered|a Sell at|lowers estimates|removed from (short[- ]term )?buy list).*",
                        headline):
                    value = -1
                if value is not None:
                    handle_news(tickers[0], 'FREC', date_millis, value,
                                born_millis)
                else:
                    util.warning('unmatched recommendations')
                    util.warning(headline + " " + str(tickers))

        elif category == 'Options':
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY1', date_millis, i, born_millis)
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            value = None
            if re.match(".+ puts? (options )?(more )?active.*", headline):
                value = -1
            elif re.match(".+ calls? (options )?(more )?active.*", headline):
                value = 1
            if value is not None:
                handle_news(tickers[0], 'FOPTION', date_millis, value,
                            born_millis)
            else:
                util.warning('unmatched options')
                util.warning(headline + " " + str(tickers))

        elif category == 'Earnings':
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY1', date_millis, i, born_millis)
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            headline = headline.replace("break-even", "breakeven")
            headline = headline.replace("break even", "breakeven")
            if headline.find("consensus") != -1:
                m = re.match(
                    ".+? (?P<reported>(%s(( to )|-))?%s) .*consensus.* (?P<consensus>%s)"
                    % (number, number, number), headline)
                if m is not None:
                    gd = m.groupdict()
                    cons = normalize_number(gd['consensus'])
                    value = None
                    gd['reported'] = gd['reported'].replace("-", " to ")
                    if gd['reported'].find(" to ") != -1:
                        rvalues = gd['reported'].split(" to ")
                        rvalues[0] = normalize_number(rvalues[0])
                        rvalues[1] = normalize_number(rvalues[1])
                        replb = min(rvalues[0], rvalues[1])
                        repub = max(rvalues[0], rvalues[1])
                        if repub < cons:
                            value = -1
                        elif replb > cons:
                            value = 1
                        else:
                            value = 0
                    else:
                        rvalue = normalize_number(gd['reported'])
                        if rvalue < cons:
                            value = -1
                        elif rvalue > cons:
                            value = 1
                        else:
                            value = 0
                    handle_news(tickers[0], 'FEARN', date_millis, value,
                                born_millis)
                else:
                    if re.match(
                            ".+ (above|will exceed|should meet or beat|at least meet) .*consensus.*",
                            headline) is not None:
                        handle_news(tickers[0], 'FEARN', date_millis, 1,
                                    born_millis)
                    elif re.match(
                            ".+ (below|not expected to meet) .*consensus.*",
                            headline) is not None:
                        handle_news(tickers[0], 'FEARN', date_millis, -1,
                                    born_millis)
                    else:
                        util.warning('unmatched consensus')
                        util.warning(headline + " " + str(tickers))

        elif category == 'Technical Analysis':
            pass

        elif category == 'Conference/Events':
            pass

        elif category == 'General news':
            pass

        elif category == 'Periodicals':
            pass

        elif category == 'Syndicate':
            value = None
            if re.match(".+ ([Tt]o [Ss]ell) .+", headline):
                value = -1
            if value is not None:
                handle_news(tickers[0], 'FSYND', date_millis, value,
                            born_millis)
            else:
                util.warning('unmatched syndicate')
                util.warning(headline + " " + str(tickers))

        else:
            util.warning('unknown category')
            util.warning(category + " " + headline + " " + str(tickers))
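
The Earnings branch hinges on a regex that captures both the reported figure (possibly a range) and the consensus figure from the headline and compares them. A trimmed-down, runnable version of that comparison, using a simplified stand-in for the module-level number pattern (which is defined outside this snippet) and ignoring reported ranges:

import re

number = r"\$?\(?-?\d+(?:\.\d+)?\)?c?"   # simplified stand-in for the real pattern

def earnings_signal(headline):
    """Return 1/-1/0 when reported EPS beats/misses/matches consensus, else None."""
    m = re.match(
        r".+? (?P<reported>%s) .*consensus.* (?P<consensus>%s)" % (number, number),
        headline)
    if m is None:
        return None
    reported = float(m.group('reported').strip('$()c'))
    consensus = float(m.group('consensus').strip('$()c'))
    if reported > consensus:
        return 1
    elif reported < consensus:
        return -1
    return 0

print(earnings_signal("XYZ reports Q3 EPS $1.12 vs consensus estimate of $1.05"))  # 1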