def _find_glabels(asm):
    """Scan every .s file under the current directory for split hi/lo symbol
    references matched by GLABEL_REGEX, synthesize a D_XXXXXXXX glabel name for
    each, and rewrite the instructions in place to use %hi()/%lo() relocations.

    Returns:
        A list of (glabel_name, lui_rom_offset, lower_rom_offset) tuples for
        every discovered glabel not listed in IGNORE_GLABELS.

    NOTE(review): the `asm` parameter is never used — the function walks the
    directory tree instead (the caller in main() passes file contents here).
    Confirm which behavior is intended.
    """
    glabels = []
    for file in FileUtil.get_filenames_from_directory_recursive('.', '.s'):
        contents = FileUtil.get_text_from_file(file)
        matches = GLABEL_REGEX.findall(contents)
        for match in matches:
            reg = match[3]
            offsets = (match[2], match[10])  # ROM offsets of the lui and [sl][bhw]
            glabel_upper = int(match[5], 16)  # upper immediate of glabel
            # lower immediate of the glabel: group 15 for load/store forms,
            # group 18 for the addiu/addu form.
            lower_group = 15 if match[15] else 18
            is_addu = lower_group == 18  # NOTE(review): computed but never used
            glabel_lower = int(match[lower_group], 16)
            glabel_lower &= 0xFFFF
            # If the low half has its sign bit set, the assembler's lui held
            # upper+1 to compensate for sign extension; undo that here so the
            # reconstructed address is (upper << 16) | lower.
            if glabel_lower & 0x8000:
                glabel_upper -= 1
            glabel = 'D_%08X' % (glabel_upper << 16 | glabel_lower)
            if glabel not in IGNORE_GLABELS:
                glabels.append((glabel, *offsets))
            # replace upper instruction
            contents = re.sub(UPPER_INSTR_REGEX_TMPL % offsets[0],
                              r'\1%s, %%hi(%s) # \2' % (reg, glabel), contents)
            # replace lower instruction
            contents = re.sub(LOWER_INSTR_LS_REGEX_TMPL % offsets[1],
                              r'\1%%lo(%s)\5' % glabel, contents)
            contents = re.sub(LOWER_INSTR_ADDU_SAME_REGEX_TMPL % offsets[1],
                              r'\1addiu \3, %%lo(%s) # \2' % glabel, contents)
            contents = re.sub(LOWER_INSTR_ADDU_REGEX_TMPL % offsets[1],
                              r'\1%%lo(%s)' % glabel, contents)
        # Only touch the file on disk if something actually matched.
        if len(matches):
            FileUtil.write_text_to_file(file, contents)
    return glabels
def finish(self):
    """Finalize the exam: re-fetch interview state from the server, submit all
    case answers, and notify the hiring manager.

    Exits with -1 when not run from the exam root or when the exam is already
    marked as finished (time_spent set).
    """
    if not FileUtil.interview_exists():
        print('Please change to the root of the exam directory, then execute this command again.')
        exit(-1)
    # do not trust existing data, retrieve interview data from server again
    interview = FileUtil.read_interview('.')
    self.cerf_api = Cerf(interview['id'], interview['authcode'])
    interview = self.cerf_api.interview.retrieve(interview['id'])
    self.load_data(interview)
    if interview['time_spent']:
        print('Your exam is over. Please stay tuned.')
        exit(-1)
    spent = calc_time_spent(interview['started'])
    print('Thank you! Your exam is done! Total time spent: %d minutes.' % spent)
    print('Submitting your code to generate report...')
    self.submit_cases()
    print('Done!')
    # BUG FIX: removed the stray trailing comma after this print() — a
    # Python 2 leftover that built a useless one-element tuple in Python 3.
    print('Notifying the hiring manager...')
    self.finish_interview()
    print('Done!')
    print('Please wait for a short moment. If no one comes in 5m, please inform frontdesk.')
def _log_glabel_usage(glabels):
    """
    Returns:
        usage: A sorted map from glabel names to a sorted list of all the ROM
            addresses it is accessed from.
        c_file_offsets: A list of (filename, ROM offset) tuples from all the
            c files used.
    Parameters:
        glabels: output from _get_glabels.
    """
    usage = OrderedDict([(glabel, set()) for glabel in glabels])
    files = FileUtil.get_filenames_from_directory_recursive('.', ('.c', '.s'))
    c_file_offsets = []
    for file in files:
        contents = FileUtil.get_text_from_file(file)
        try:
            offset = _get_file_offset(file, contents)
            if file.endswith('.c'):
                c_file_offsets.append((file, offset))
            matches = re.findall(GLABEL_REGEX, contents)
            for glabel in matches:
                if glabel in usage:
                    usage[glabel].add(offset)
        except Exception:
            # BUG FIX: was a bare `except:` — that also swallowed
            # KeyboardInterrupt/SystemExit. Still best-effort: files whose
            # offset cannot be determined are simply skipped.
            continue
    for glabel in usage:
        # sorted() already returns a list; the extra list() was redundant.
        usage[glabel] = sorted(usage[glabel])
    c_file_offsets.sort(key=lambda f: f[1])
    return usage, c_file_offsets
def main():
    """Entry point: report every undeclared label found in the target asm file."""
    FileUtil.set_working_dir_to_project_base()
    asm_path = 'asm/non_matchings/unknown_005740/func_80005254.s'
    glabels = _find_glabels(FileUtil.get_text_from_file(asm_path))
    print('%d undeclared labels found:' % len(glabels))
    for label in glabels:
        print(label)
def test_get_html(self):
    """
    Tests get_html method
    """
    expected_html = FileUtil().get_file_contents("example.html")
    actual_html = HtmlRequester().get_html("http://example.org")
    self.assertEqual(expected_html, actual_html)
def test_get_links(self):
    """
    Tests get_links method
    """
    files = FileUtil()
    expected_links = files.get_file_contents("links_test_data.txt")
    html = files.get_file_contents("html_test_data.html")
    actual_links = HtmlParser().get_links(html)
    self.assertEqual(expected_links, actual_links)
def test_get_web_pages(self):
    """
    Tests get_web_pages method
    """
    files = FileUtil()
    expected_pages = files.get_file_contents("web_pages_test_data.txt")
    urls = files.get_file_contents("same_hostname_urls_test_data.txt")
    actual_pages = HtmlParser().get_web_pages(urls)
    self.assertEqual(expected_pages, actual_pages)
def find_and_rename(directory, paths):
    """Rename each path under *directory* whose full path contains the
    module-level oldSymbol, substituting newSymbol; missing files are skipped."""
    for name in paths:
        path = directory + '/' + name
        if oldSymbol not in path:
            continue
        newPath = path.replace(oldSymbol, newSymbol)
        try:
            FileUtil.rename_file(path, newPath)
            print('Renamed "' + path + '" to "' + newPath + '"')
        except FileNotFoundError:
            pass
def mock_get_html(self, url):
    """
    Mocks the get_html method of the html_requester class to return the
    contents of html_test_data.html for the known test URL, and an empty
    string for anything else. This allows inputting test html data without
    having to host it online.
    """
    if url != "http://www.domain.com":
        return ""
    return FileUtil().get_file_contents("html_test_data.html")
def test_get_same_hostname_urls(self):
    """
    Tests get_same_hostname_urls method
    """
    files = FileUtil()
    expected_urls = files.get_file_contents("same_hostname_urls_test_data.txt")
    links = files.get_file_contents("links_test_data.txt")
    actual_urls = HtmlParser().get_same_hostname_urls("http://www.domain.com/", links)
    self.assertEqual(expected_urls, actual_urls)
def load_data(path, dsetname=''):
    """Load CSV datums found under *path* into a DataFrame.

    Parameters:
        path: directory walked via FileUtil.
        dsetname: when non-empty, only datums whose file name equals it are
            loaded.

    Returns:
        The last loaded DataFrame (indexed by record_id), or None when no
        datum matched.
    """
    util = FileUtil(path)
    util.walk()
    # BUG FIX: `data` was referenced at the end without ever being assigned
    # when no datum matched, raising a NameError/UnboundLocalError.
    data = None
    for key in util.datums:
        if len(dsetname) > 0:
            fname = util.datums[key].split('/')[-1]
            if fname != dsetname:
                continue
        print('Load ', key)
        data = pd.read_csv(util.datums[key], index_col="record_id")
        write_meta(list(data.columns.values), key)
    return data
def link_features(path):
    """Symlink every *.features.csv datum found under *path* into /pfs/out.

    Link failures (e.g. the target already exists) are reported and skipped.
    """
    # Load the datasets dataframes
    util = FileUtil(path)
    util.walk()
    for key in util.datums:
        fname = util.datums[key].split('/')[-1]
        if '.features.csv' not in fname:
            continue
        infile = util.datums[key]
        outfile = '/pfs/out/' + fname
        try:
            os.symlink(infile, outfile)
        except OSError:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; os.symlink failures are OSError.
            print('Cannot create sim-link', infile, outfile)
def __init__(self, filename):
    """Load a ROM image from *filename*, normalize its endianness if needed,
    and record its bytes, size, and MD5 digest.

    NOTE(review): relies on _test_endianness() to set self.fixedRomEndianess
    and (presumably) byte-swap self.bytearray in place — confirm against that
    method's implementation.
    """
    with open(filename, 'rb') as romFile:
        self.fixedRomEndianess = False
        self.bytearray = romFile.read()
        self.bytes = list(self.bytearray)
        self.size = len(self.bytes)
        self._test_endianness()
        # MD5 is taken after any endianness fix, so it reflects the
        # big-endian image.
        self.md5 = hashlib.md5(self.bytearray).hexdigest()
        if self.fixedRomEndianess:
            FileUtil.delete_file(filename)
            # Save the ROM as big-endian
            with open(filename[:-4] + '.z64', 'wb') as romFile:
                romFile.write(self.bytearray)
def main():
    """Entry point: resolve a symbol to its address, or an address to its symbol."""
    FileUtil.set_working_dir_to_project_base()
    if len(sys.argv) != 2:
        show_help()
        return
    query = sys.argv[1]
    symbol, address = find_pairing(query)
    if symbol is not None and address is not None:
        print('0x%08X = %s' % (address, symbol))
    elif is_address(query):
        print('No symbol was found for the address 0x%08X' % int(query, 16))
    else:
        print('No address was found for the symbol "%s"' % query)
def main():
    """Entry point: print suggested file splits for the .data/.rodata/.bss glabels."""
    FileUtil.set_working_dir_to_project_base()
    data_glabels, rodata_glabels, bss_glabels = _get_glabels()
    sections = (('.data', data_glabels),
                ('.rodata', rodata_glabels),
                ('.bss', bss_glabels))
    for section_name, glabels in sections:
        usage, c_file_offsets = _log_glabel_usage(glabels)
        filtered_usage = _filter_glabel_usage(usage)
        file_splits = _split_glabel_files(filtered_usage, c_file_offsets)
        print('File splits for %s:' % section_name)
        for split in file_splits:
            print('%s (%06X): %s' % split)
        print()
def test_crawl(self):
    """
    Tests crawl method

    The get_html method of the html_requester class is mocked to return the
    contents of html_test_data.html. This mocking allows for inputting test
    html data without having to host it online.
    """
    expected_result = FileUtil().get_file_contents("crawl_test_data.txt")
    crawler = WebCrawler()
    crawler.html_requester.get_html = lambda url: self.mock_get_html(url)
    self.assertEqual(expected_result, crawler.crawl("http://www.domain.com"))
def loadAllFromFiles() -> Dict[str, List[Dict[str, Any]]]:
    """Load every known card set's JSON file from SetUtil.CARDS_DIR, keyed by
    set name."""
    loaded = {}
    for setName in SetUtil.sets:
        path = f"{SetUtil.CARDS_DIR}{os.sep}{sanitize(setName)}"
        loaded[setName] = FileUtil.getJSONContents(path)
    return loaded
def load_data(path, dsetname=''):
    """Walk *path* and copy the datum whose file name contains *dsetname*.

    NOTE(review): this function appears broken as written — see the inline
    notes; the intended destination of the copy and the intended return value
    need to be confirmed before it can be fixed.
    """
    util = FileUtil(path)
    util.walk()
    for key in util.datums:
        if (len(dsetname) > 0):
            fname = util.datums[key].split('/')[-1]
            if (dsetname in fname):
                print('Load ', key)
                #data = pd.read_csv(util.datums[key],
                #                   header=None)
                #write_meta(list(data.columns.values),key)
                # NOTE(review): copyfile() is called with only a source — the
                # destination argument is missing, so this raises TypeError at
                # runtime. Confirm the intended destination (a sibling
                # function links into '/pfs/out/').
                copyfile(util.datums[key], )
            else:
                print(dsetname, key)
    # NOTE(review): `data` is never assigned (the read_csv line above is
    # commented out), so reaching this line raises NameError.
    return data
def main():
    """Generate every guide listed in guides.json and report which succeeded
    and which failed. Exits with 1 when guides.json cannot be loaded."""
    guides = FileUtil.getJSONContents('guides.json')
    # BUG FIX: compare to None with `is`, not `==`.
    if guides is None:
        exit(1)
    successes = []
    failures = []
    # Tag yourself
    for g in guides:
        if genGuide(guides[g]):
            successes.append(g)
        else:
            failures.append(g)
    print()
    if len(successes) > 0:
        # BUG FIX: corrected the "succssful" typo in the user-facing message.
        print('\n%d successful generation(s):' % len(successes))
        for s in successes:
            print('\t%s' % s)
    else:
        print('0 successful generations')
    print()
    if len(failures) > 0:
        print('%d failed generation(s):' % len(failures))
        for f in failures:
            print('\t%s' % f)
    else:
        print('0 failed generations')
    print()
def main():
    """Validate every guide listed in guides.json and report which are valid
    and which are not. Exits with 1 when guides.json cannot be loaded."""
    guides = FileUtil.getJSONContents('guides.json')
    # BUG FIX: compare to None with `is`, not `==`.
    if guides is None:
        exit(1)
    validGuides = []
    invalidGuides = []
    for g in guides:
        if validateGuide(guides[g]):
            validGuides.append(g)
        else:
            invalidGuides.append(g)
    print()
    if len(validGuides) > 0:
        print('\n%d valid guide(s):' % len(validGuides))
        for vg in validGuides:
            print('\t%s' % vg)
    else:
        print('0 valid guides')
    print()
    if len(invalidGuides) > 0:
        print('%d invalid guide(s):' % len(invalidGuides))
        for ivg in invalidGuides:
            print('\t%s' % ivg)
    else:
        print('0 invalid guides')
    print()
def submit_cases(self):
    """Find every directory named case* under the current directory and submit
    each one via submit_case()."""
    base = os.getcwd()
    for root, dirs, files in os.walk('.'):
        for d in dirs:
            if d.startswith('case'):
                # BUG FIX: join with the walked `root` as well — the original
                # ignored it, so any case directory not directly under the
                # cwd resolved to a wrong (nonexistent) path.
                config = FileUtil.read_case(os.path.normpath(os.path.join(base, root, d)))
                self.submit_case(config)
def __read_config():
    """Read settings.yml / account.yml / db.yml from the project's config
    directory and return the pieces the app cares about."""
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # Absolute path of the directory holding the global config files.
    config_dir = os.path.join(os.path.realpath(base_dir), "config")
    settings_file = os.path.join(config_dir, "settings.yml")
    account_file = os.path.join(config_dir, "account.yml")
    db_file = os.path.join(config_dir, "db.yml")
    settings = FileUtil().connect_to(settings_file).parsed_data
    accounts = FileUtil().connect_to(account_file).parsed_data
    db = FileUtil().connect_to(db_file).parsed_data
    return (settings["env"], settings["log_switch"], accounts, db,
            settings["log_level"], settings["print_switch"])
def validate_meta(path):
    """Check that every meta datum under *path* carries identical rows.

    Returns:
        The common meta header row, or [] as soon as any mismatch is found
        (the mismatch is printed).
    """
    util = FileUtil(path)
    util.walk()
    meta = []
    for key in util.datums:
        # IDIOM FIX: a membership test replaces the original inner loop,
        # which re-opened and re-read the same file once for every 'meta'
        # component appearing in the key.
        if 'meta' not in key.split('.'):
            continue
        with open(path + '/' + key, 'r') as f:
            reader = csv.reader(f)
            # The first meta file seen establishes the reference row.
            if len(meta) == 0:
                meta = next(reader)
            for row in reader:
                if meta != row:
                    print('Meta data does not match', meta, row)
                    return []
    return meta
def get_data(path):
    """Concatenate the rows of every datum under *path*, merge them, and write
    the merged result under a name built from all datum keys."""
    util = FileUtil(path)
    util.walk()
    rows = []
    name = ''
    for key in util.datums:
        print(key, util.datums[key])
        name += key
        with open(util.datums[key], 'r') as datum:
            rows.extend(csv.reader(datum))
    if len(rows) > 0:
        write(name, merge_data(rows))
    else:
        print('No datums collected')
def init():
    """
    Right now, this just gets the bearer token.

    Reuses the cached token from disk when it is still valid (with a 180 s
    safety margin); otherwise requests a new one and caches it.
    """
    # Try getting the cached bearer token
    # TODO Factor this out into an abstract base class
    tokenFP = f"{FileUtil.TOKENS_FOLDER_PATH}{os.sep}{TCGPlayerAPI.KEY_NAME}.json"
    currTime = int(time.time())
    try:
        tokenInfo = FileUtil.getJSONContents(tokenFP)['bearer_token']
        tokenValue = tokenInfo['value']
        expireTime = tokenInfo['expires']
        # BUG FIX: the condition was inverted — the original reused the
        # cached token exactly when it was within 180 s of expiring (and
        # refused it when it was still fresh). A cached token is usable only
        # while now + margin is still BEFORE its expiry.
        if (currTime + 180) < expireTime:
            TCGPlayerAPI.bearerToken = tokenValue
            haveToken = True
        else:
            haveToken = False
    except Exception:
        # No cache file / malformed cache: fall through to fetching anew.
        haveToken = False
        expireTime = currTime - 1
    if haveToken:
        print('Using cached bearer token for %s' % TCGPlayerAPI.KEY_NAME)
    else:
        print('Getting new bearer token for %s' % TCGPlayerAPI.KEY_NAME)
        r = requests.post(TCGPlayerAPI.GET_BEARER_TOKEN_URL, data={
            'grant_type': 'client_credentials',
            'client_id': TCGPlayerAPI.keyInfo['public'],
            'client_secret': TCGPlayerAPI.keyInfo['private'],
        })
        print(f"Request ({r.status_code}):\n{r.text}")
        respJSON = json.loads(r.text)
        # TODO: Make this not clobber everything else in the file
        # It's fine for now because there's nothing else, though
        FileUtil.writeJSONContents(
            tokenFP, {
                'bearer_token': {
                    'value': respJSON['access_token'],
                    # Store the absolute expiry timestamp, not the TTL.
                    'expires': respJSON['expires_in'] + currTime
                }
            })
        TCGPlayerAPI.bearerToken = respJSON['access_token']
def setUpClass(cls):
    """One-time test fixture: read the KBase deployment config, build an
    authenticated method context, create a uniquely-named scratch workspace,
    and upload the genome / reads / alignment test data through FileUtil."""
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    # Flatten the kb_GenomeBrowser section into a plain dict.
    for nameval in config.items('kb_GenomeBrowser'):
        cls.cfg[nameval[0]] = nameval[1]
    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg['auth-service-url']
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'user_id': user_id,
                    'provenance': [
                        {'service': 'kb_GenomeBrowser',
                         'method': 'please_never_use_it_in_production',
                         'method_params': []
                         }],
                    'authenticated': 1})
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = kb_GenomeBrowser(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    # Millisecond timestamp keeps workspace names unique across test runs.
    suffix = int(time.time() * 1000)
    wsName = "test_GenomeBrowser_" + str(suffix)
    ret = cls.wsClient.create_workspace({'workspace': wsName})  # noqa
    cls.wsName = wsName
    cls.file_util = FileUtil(wsName, cls.wsURL, cls.callback_url)
    # Upload genomes
    base_gbk_file = "data/at_chrom1_section.gbk"
    gbk_file = os.path.join(cls.scratch, os.path.basename(base_gbk_file))
    shutil.copy(base_gbk_file, gbk_file)
    cls.genome_ref = cls.file_util.load_genbank_file(gbk_file, 'my_test_genome')
    # get gff file
    cls.gff_file = cls.file_util.get_gff_file(cls.genome_ref)
    # get fasta file
    cls.fasta_file = cls.file_util.get_fasta_file(cls.genome_ref)
    # Upload reads
    base_reads_file = "data/extracted_WT_rep1.fastq"
    reads_file = os.path.join(cls.scratch, os.path.basename(base_reads_file))
    shutil.copy(base_reads_file, reads_file)
    cls.reads_ref = cls.file_util.load_reads_file("illumina", reads_file, None,
                                                  "my_reads_lib")
    # Upload alignments
    base_align_file = "data/at_chr1_wt_rep1_hisat2.bam"
    cls.bam_file = os.path.join(cls.scratch,
                                os.path.basename(base_align_file))
    shutil.copy(base_align_file, cls.bam_file)
    cls.alignment_ref = cls.file_util.load_bam_file(cls.bam_file,
                                                    cls.genome_ref,
                                                    cls.reads_ref,
                                                    'my_hisat2_alignment')
def process_image_file(config, file):
    """Convert the colors of one image file according to *config*.

    Returns:
        True on success; False when the image cannot be read, no convertor
        exists for config.method, conversion fails, or the output cannot be
        written.
    """
    image = cv2.imread(file)
    if image is None:
        # BUG FIX: the '%' operator was applied to print()'s return value
        # (None) — `print("...") % file` — raising TypeError on this error
        # path instead of printing the filename.
        print("读取图片:%s失败!" % file)
        return False
    convertor = ColorConvertorFactory.create_color_convertor(config.method)
    if convertor is None:
        return False
    output_image = convertor.convert(image, config)
    if output_image is None:
        return False
    output_path = ImagesColorProcessor.get_output_path(
        file, config.input_dir, config.output_dir)
    if config.overwrite and FileUtil.exists(output_path):
        FileUtil.delete(output_path)
    if not cv2.imwrite(output_path, output_image):
        # Same `print(...) % value` fix as above.
        print("输出图片到:%s失败!" % output_path)
        return False
    return True
def load_processed(path, dsetname, columns, dtypes):
    """Load the processed CSV datum named *dsetname* found under *path*.

    Parameters:
        path: directory walked via FileUtil.
        dsetname: exact file name to load (empty string loads every datum).
        columns: column names to assign (the second raw column is dropped).
        dtypes: mapping of column name -> dtype to cast.

    Returns:
        The DataFrame indexed by 'rec_id', or None when no datum matched.
    """
    util = FileUtil(path)
    util.walk()
    # BUG FIX: `data` was undefined (NameError) when nothing matched.
    data = None
    for key in util.datums:
        if len(dsetname) > 0:
            fname = util.datums[key].split('/')[-1]
            if fname != dsetname:
                continue
        print('Load ', key)
        data = pd.read_csv(util.datums[key], header=None)
        data = data.drop(data.columns[1], axis=1)
        data.columns = columns
        # BUG FIX: the inner loop previously reused the name `key`,
        # clobbering the outer loop variable.
        for col in dtypes:
            data[col] = data[col].astype(dtypes[col])
        data = data.set_index('rec_id')
    return data
def __init__(self, charInfoPath):
    """Load the character info JSON plus the gear/trinket catalogs, compute
    per-piece DPS for the currently equipped gear, and total its stats.

    Slots (currently - adding weapons and trinkets later):
        'Back', 'Belt', 'Bracer', 'Chest', 'Feet', 'Gloves',
        'Head', 'Legs', 'Neck', 'Ring', 'Shoulder'
    """
    paths = {'allGear': 'AllGear.json', 'trinkets': 'Trinkets.json'}
    self.charInfo = FileUtil.getJSONContents(charInfoPath)
    allGearList = FileUtil.getJSONContents(paths['allGear'])
    allTrinketsList = FileUtil.getJSONContents(paths['trinkets'])
    # They don't explicitly say that they're trinkets
    for trink in allTrinketsList:
        trink['Slot'] = 'Trinket'
    # Combine all gear into one list
    self.allGear = []
    self.allGear.extend(allGearList)
    self.allGear.extend(allTrinketsList)
    # Then turn that list into a map from name to the piece of gear
    self.allGear = DataUtil.toMap(self.allGear, 'Name')
    # Load the current gear into memory
    self.currentGear = DataUtil.statifyNamedGear(
        self.charInfo['Current Gear'], self.allGear)
    # TODO
    # SEE IF THIS DOESN'T BREAK THINGS LATER ON IN EXECUTION
    # (Might not be kosher if slotified this early)
    self.allGear = CalcUtil.slotifyAllGear(self.allGear)
    # Calculate each piece's DPS
    for name in self.currentGear:
        piece = self.currentGear[name]
        piece['DPS'] = CalcUtil.calcDPS(piece, self.charInfo)
    # Get some basic overall stats about the current gear
    self.totalStats = CalcUtil.getTotalStats(self.currentGear, Globals.allStats)
def _get_glabels():
    """
    Returns all the glabel definitions in the data file, split into .data,
    .rodata, and .bss (using the RODATA_START / BSS_START markers as the
    section boundaries).
    """
    contents = FileUtil.get_text_from_file(DATA_FILE_PATH)
    labels = [label for label in re.findall(GLABEL_DEF_REGEX, contents)
              if label not in IGNORE_GLABELS]
    rodata_start = labels.index(RODATA_START)
    bss_start = labels.index(BSS_START)
    return labels[:rodata_start], labels[rodata_start:bss_start], labels[bss_start:]
def main():
    """Build a dotted version string from the latest SVN revision of each
    tracked path, write it to version/version.txt, commit it, then sync the
    version directory into the server trunk and commit that too."""
    if not dataPath or not msgPath:
        return
    svn = pysvn.Client()
    # IDIOM FIX: '.'.join replaces the manual loop with a last-element
    # special case for the trailing dot.
    versionStr = '.'.join(
        str(svn.log(tPath, limit=1)[0].revision.number) for tPath in allPath)
    exportPath1 = op.join(op.dirname(dataPath), "version")
    exportPath = op.join(exportPath1, 'version.txt')
    # ROBUSTNESS FIX: context manager guarantees the file is closed even if
    # the write raises (the original used open/write/close).
    with open(exportPath, 'w') as f:
        f.write(versionStr)
    SvnCmd(path=exportPath, cmd='commit',
           logmsg='Commit version text.{}||{}'.format(exportPath1, SERV_VERSION_DIR)).Run()
    # copy version to trunk
    SvnCmd(path=SERV_VERSION_DIR, cmd='update', logmsg='update version files.').Run()
    update_files_version, add_files_version = FileUtil.copy(
        exportPath1, SERV_VERSION_DIR, suffixes=('.txt', ), force=True)
    # Newly appearing files must be svn-added before the commit below.
    if add_files_version:
        print("added:")
        prettyOutput(add_files_version)
        for new_file in add_files_version:
            dst_file = op.join(SERV_VERSION_DIR, op.basename(new_file))
            SvnCmd(path=dst_file, cmd='add', logmsg='add new config data files').Run()
    SvnCmd(path=SERV_VERSION_DIR, cmd='commit', logmsg='commit version files').Run()
def calculate_matches():
    """Parse the linker map file into the global mapFile table (symbol ->
    {value, length}) and set the global RAM_TO_ROM offset from the
    __RAM_TO_ROM symbol."""
    global mapFile, RAM_TO_ROM
    REGEX_MAP_GET_LABEL = r"[ ]*?0x[0-9A-Fa-f]{8}([0-9A-Fa-f]{8})[ ]*?([_A-Za-z0-9]+)"
    mapText = FileUtil.get_text_from_file(MAP_FILEPATH)
    mapMatches = getMatches(mapText, REGEX_MAP_GET_LABEL)
    # Each symbol's length is the next symbol's address minus its own, so the
    # final match has no computable length and is skipped.
    for i in range(0, len(mapMatches) - 1):
        match = mapMatches[i]
        labelValue = match[1]
        # NOTE(review): labelValue is match[1], yet it is indexed again with
        # [0] (the address) and [1] (the name). That only makes sense if
        # getMatches returns nested tuples like (full, (addr, name)) — with a
        # plain re.findall tuple this would index individual characters of a
        # string. Confirm against getMatches' implementation.
        value = int(labelValue[0], 16)
        length = int(mapMatches[i + 1][1][0], 16) - value
        mapFile[labelValue[1]] = {"value": value, "length": length}
    RAM_TO_ROM = mapFile['__RAM_TO_ROM']['value']
def submit_case(self, case):
    """Concatenate the valid source files of one case directory and upload
    them as a single answer; failures are reported but do not abort so the
    remaining cases can still be submitted."""
    path = os.path.join(os.getcwd(), 'case%s' % case['position'])
    # BUG FIX: removed the stray trailing comma after print() — a Python 2
    # leftover that built a useless one-element tuple in Python 3.
    print('\tSubmit case%s...' % case['position'])
    extentions = [ext.strip() for ext in case['extentions'].split(',')]
    first_list, second_list = FileUtil.get_valid_files(path, extentions)
    content = ''
    for name in first_list + second_list:
        # Each file is prefixed with a comment header naming it.
        s = '/* %s */\n\n%s' % (name, FileUtil.read_content(os.path.join(path, name)))
        content += s
    data = {
        'interview': self.id,
        'applicant': self.interview['applicant_id'],
        'case': case['cid'],
        'content': content
    }
    if not self.cerf_api.answer.create(data):
        print('Cannot submit case%s, please contact your hiring manager.' % case['position'])
        # do not bail out so that we could try the latter cases.
        # exit(-1)
    else:
        print('Done!')
def __init__(self, old_file, new_file, filename=""):
    """Store the working file names and, when *filename* is given, build a
    FileUtil over it and parse its structure.

    ROBUSTNESS FIX: self.fu is now always defined (None when no filename is
    supplied) instead of being silently left unset, so later access fails
    with a clear 'NoneType' error rather than a missing-attribute surprise.
    """
    self.fu = None
    if filename:
        self.fu = FileUtil(filename)
        self.fu.get_structure()
    self.old_file = old_file
    self.new_file = new_file
class Feature:
    """Incremental feature-extraction pipeline for Amazon review data (Python 2).

    Each ``save_fN`` method reads the intermediate file ``old_file + str(N-1)``
    (tab-separated, one review per line), appends one new feature column, and
    writes the result to ``new_file + str(N)``.  Row order therefore must match
    the order of the lists returned by ``self.fu`` throughout the pipeline.
    """

    def __init__(self, old_file, new_file, filename=""):
        # When a source filename is given, parse it eagerly via FileUtil.
        if filename:
            self.fu = FileUtil(filename)
            self.fu.get_structure()
        self.old_file = old_file
        self.new_file = new_file
        # self.db = Database(dbname)

    def save_reviewerid(self):
        """Append every reviewer id (one per line) to the base old_file."""
        reviewIds = self.fu.get_memberId_list()
        with open(self.old_file, 'a') as fp:
            review_txt = ""
            for review_id in reviewIds:
                review_txt += review_id + '\n'
            fp.write(review_txt)
        # for review_id in reviewIds:
        #     self.db.insert_into_features({'review_id': review_id})

    def save_f1(self):
        """F1: raw feedback value per review."""
        features = self.fu.get_feedback_list()
        review_txt = ""
        with open(self.old_file) as fp:
            for index, line in enumerate(fp.readlines()):
                review_txt += line.replace('\n', '') + '\t' + features[index] + '\n'
        with open(self.new_file + '1', 'w') as fp:
            fp.write(review_txt)

    def save_f2(self):
        """F2: helpful-feedback value per review."""
        help_features = self.fu.get_help_feedback_list()
        review_txt = ""
        with open(self.old_file + '1') as fp:
            for index, line in enumerate(fp.readlines()):
                review_txt += line.replace('\n', '') + '\t' + help_features[index] + '\n'
        with open(self.new_file + '2', 'w') as fp:
            fp.write(review_txt)

    def save_f3(self):
        """F3: helpful ratio = column 2 / column 1 (0.0 when column 1 is 0)."""
        review_txt = ""
        with open(self.old_file + '2') as fp:
            for line in fp.readlines():
                features = line.split('\t')
                if float(features[1]) == 0:
                    review_txt += line.replace('\n', '') + '\t' + '0.0\n'
                else:
                    review_txt += line.replace('\n', '') + '\t' + str(float(features[2])/float(features[1])) + '\n'
        with open(self.new_file + '3', 'w') as fp:
            fp.write(review_txt)

    def save_f4(self):
        """F4: number of space-separated words in the review title."""
        title_list = self.fu.get_title_list()
        review_txt = ""
        with open(self.old_file + '3') as fp:
            for index, line in enumerate(fp.readlines()):
                review_txt += line.replace('\n', '') + '\t' + str(len(title_list[index].split(' '))) + '\n'
        with open(self.new_file + '4', 'w') as fp:
            fp.write(review_txt)

    def save_f5(self):
        """F5: number of space-separated words in the review body."""
        content_list = self.fu.get_content_list()
        review_txt = ""
        with open(self.old_file + '4') as fp:
            for index, line in enumerate(fp.readlines()):
                review_txt += line.replace('\n', '') + '\t' + str(len(content_list[index].split(' '))) + '\n'
        with open(self.new_file + '5', 'w') as fp:
            fp.write(review_txt)

    def save_f6(self):
        """F6: 1-based chronological rank of the review within its product.

        NOTE(review): ``dict`` shadows the builtin, and the output loop uses
        ``rank_list[product][...]`` where ``product`` still holds the LAST
        product from the first loop — this looks like it should be the
        product of the review at ``index`` (compare save_f20, which looks it
        up per row).  Confirm against expected output before relying on it.
        """
        reviewer_product_date_list = self.fu.get_column_list([1,2])
        review_txt = ""
        dict = {}
        for idx, reviewer_product_date in enumerate(reviewer_product_date_list):
            product = reviewer_product_date[0]
            date = reviewer_product_date[1]
            if product not in dict:
                dict[product] = {}
            try:
                dict[product][idx] = parse(date)
            except:
                print date
        rank_list = rank_dict(dict, False)
        with open(self.old_file + '5') as fp:
            for index, line in enumerate(fp.readlines()):
                review_txt += line.replace('\n', '') + '\t' + str(rank_list[product][index] + 1) + '\n'
        with open(self.new_file + '6', 'w') as fp:
            fp.write(review_txt)

    def save_f7(self):
        """F7: like F6 but ranked in the opposite date order (rank_dict(..., True)).

        NOTE(review): same stale-``product`` concern as save_f6.
        """
        reviewer_product_date_list = self.fu.get_column_list([1,2])
        review_txt = ""
        dict = {}
        for idx, reviewer_product_date in enumerate(reviewer_product_date_list):
            product = reviewer_product_date[0]
            date = reviewer_product_date[1]
            if product not in dict:
                dict[product] = {}
            try:
                dict[product][idx] = parse(date)
            except:
                print date
        rank_list = rank_dict(dict, True)
        # with open('review_product_rank', 'w') as fp:
        #     fp.write(str(dict))
        with open(self.old_file + '6') as fp:
            for index, line in enumerate(fp.readlines()):
                review_txt += line.replace('\n', '') + '\t' + str(rank_list[product][index] + 1) + '\n'
        with open(self.new_file + '7', 'w') as fp:
            fp.write(review_txt)

    def save_f8(self):
        """F8: 1 when column 6 (F6 rank) equals '1' — i.e. first review — else 0."""
        review_txt = ""
        with open(self.old_file + '7') as fp:
            for index, line in enumerate(fp.readlines()):
                if line.split('\t')[6] == '1':
                    review_txt += line.replace('\n', '') + '\t' + '1\n'
                else:
                    review_txt += line.replace('\n', '') + '\t' + '0\n'
        with open(self.new_file + '8', 'w') as fp:
            fp.write(review_txt)

    def save_f9(self):
        """F9: 1 when both rank columns 6 and 7 are '1' (only review), else 0."""
        review_txt = ""
        with open(self.old_file + '8') as fp:
            for index, line in enumerate(fp.readlines()):
                if line.split('\t')[6] == '1' and line.split('\t')[7] == '1':
                    review_txt += line.replace('\n', '') + '\t' + '1\n'
                else:
                    review_txt += line.replace('\n', '') + '\t' + '0\n'
        with open(self.new_file + '9', 'w') as fp:
            fp.write(review_txt)

    def save_f10(self):
        """F10: fraction of words found in the positive-opinion lexicon.

        NOTE(review): readlines() keeps the trailing '\\n' on each lexicon
        word, so the ``in positive_words`` membership test will rarely match;
        words likely need .strip() — confirm before trusting this feature.
        """
        review_txt = ""
        content_list = self.fu.get_content_list()
        positive_words = []
        with open('../opinion-lexicon-English/positive-words.txt') as fp:
            positive_words = [word for word in fp.readlines()]
        with open(self.old_file + '9') as fp:
            for index, line in enumerate(fp.readlines()):
                content = content_list[index].split(' ')
                content_len = len(content)
                positive_len = 0.0
                for word in content:
                    if word.lower() in positive_words:
                        positive_len += 1
                print positive_len
                review_txt += line.replace('\n', '') + '\t' + str(positive_len / content_len) +'\n'
        with open(self.new_file + '10', 'w') as fp:
            fp.write(review_txt)

    def save_f11(self):
        """F11: fraction of words found in the negative-opinion lexicon.

        NOTE(review): same unstripped-'\\n' lexicon concern as save_f10.
        """
        review_txt = ""
        content_list = self.fu.get_content_list()
        negative_words = []
        with open('../opinion-lexicon-English/negative-words.txt') as fp:
            negative_words = [word for word in fp.readlines()]
        print len(negative_words)
        with open(self.old_file + '10') as fp:
            for index, line in enumerate(fp.readlines()):
                content = content_list[index].split(' ')
                content_len = len(content)
                negative_len = 0.0
                for word in content:
                    if word.lower() in negative_words:
                        negative_len += 1
                print negative_len
                review_txt += line.replace('\n', '') + '\t' + str(negative_len / content_len) +'\n'
        with open(self.new_file + '11', 'w') as fp:
            fp.write(review_txt)

    def save_f12(self):
        """F12: cosine similarity between review content and its product's feature text."""
        review_txt = ""
        product_content_list = self.fu.get_column_list([1,-1])
        product_feature_list = {}
        with open('../AmazonDataBackup/productInfoXML-reviewed-mProducts.features') as fp:
            for line in fp:
                product_id = line.split('\t')[0]
                product_feature = line.split('\t')[1]
                product_feature_list[product_id] = product_feature
        with open(self.old_file + '11') as fp:
            for index, line in enumerate(fp.readlines()):
                product_id = product_content_list[index][0]
                content = product_content_list[index][1].lower()
                product_feature = product_feature_list[product_id]
                cos_sim = get_cosine(text_to_vector(content), text_to_vector(product_feature))
                review_txt += line.replace('\n', '') + '\t' + str(cos_sim) +'\n'
        with open(self.new_file + '12', 'w') as fp:
            fp.write(review_txt)

    def save_f13(self):
        """F13: frequency of the product's brand name within the review words
        (0 when the lookup/division fails, via the bare except)."""
        review_txt = ""
        product_content_list = self.fu.get_column_list([1,-1])
        p_b_dict = product_brand_dict('../AmazonDataBackup/productInfoXML-reviewed-mProducts.copy')
        with open(self.old_file + '12') as fp:
            for index, line in enumerate(fp.readlines()):
                product_id = product_content_list[index][0]
                content = product_content_list[index][1].lower()
                content = WORD.findall(content)
                counted_content = Counter(content)
                brand = p_b_dict[product_id]
                try:
                    brand_num = counted_content[brand]
                    review_txt += line.replace('\n', '') + '\t' + str(float(brand_num) / len(content)) +'\n'
                except:
                    brand_num = 0
                    review_txt += line.replace('\n', '') + '\t' + '0\n'
        with open(self.new_file + '13', 'w') as fp:
            fp.write(review_txt)

    def save_f14(self):
        """F14: ratio of numeric tokens to word tokens in the content.

        NOTE(review): local ``format`` shadows the builtin.
        """
        review_txt = ""
        content_list = self.fu.get_content_list()
        format = re.compile(r'\d+')
        with open(self.old_file + '13') as fp:
            for index, line in enumerate(fp.readlines()):
                content = content_list[index]
                number = len(format.findall(content))
                content = re.compile(r'\w+').findall(content)
                if len(content):
                    review_txt += line.replace('\n', '') + '\t' + str(float(number) / len(content)) +'\n'
                else:
                    review_txt += line.replace('\n', '') + '\t' + '0\n'
        with open(self.new_file + '14', 'w') as fp:
            fp.write(review_txt)

    def save_f15(self):
        """F15: uppercase CHARACTERS per word token (counted over the raw string)."""
        review_txt = ""
        content_list = self.fu.get_content_list()
        with open(self.old_file + '14') as fp:
            for index, line in enumerate(fp.readlines()):
                content = content_list[index]
                capital_num = sum(1 for c in content if c.isupper())
                content = re.compile(r'\w+').findall(content)
                if len(content):
                    review_txt += line.replace('\n', '') + '\t' + str(float(capital_num) / len(content)) +'\n'
                else:
                    review_txt += line.replace('\n', '') + '\t' + '0\n'
        with open(self.new_file + '15', 'w') as fp:
            fp.write(review_txt)

    def save_f16(self):
        """F16: fraction of ALL-CAPS word tokens — here ``c`` iterates whole
        words (tokenized first), so str.isupper() tests entire words, unlike
        save_f15 which counts individual characters."""
        review_txt = ""
        content_list = self.fu.get_content_list()
        with open(self.old_file + '15') as fp:
            for index, line in enumerate(fp.readlines()):
                content = re.compile(r'\w+').findall(content_list[index])
                capital_num = sum(1 for c in content if c.isupper())
                # content = re.compile(r'\w+').findall(content)
                if len(content):
                    review_txt += line.replace('\n', '') + '\t' + str(float(capital_num) / len(content)) +'\n'
                else:
                    review_txt += line.replace('\n', '') + '\t' + '0\n'
        with open(self.new_file + '16', 'w') as fp:
            fp.write(review_txt)

    def save_f17(self):
        """F17: raw star rating of the review."""
        review_txt = ""
        rating_list = self.fu.get_rating_list()
        with open(self.old_file + '16') as fp:
            for index, line in enumerate(fp.readlines()):
                review_txt += line.replace('\n', '') + '\t' + str(rating_list[index]) +'\n'
        with open(self.new_file + '17', 'w') as fp:
            fp.write(review_txt)

    def save_f18(self):
        """F18: deviation of this review's rating from the product's average rating."""
        review_txt = ""
        product_rating_list = self.fu.get_column_list([1, 5])
        product_avg_rating_dict = product_avg_rating(product_rating_list)
        with open(self.old_file + '17') as fp:
            for index, line in enumerate(fp.readlines()):
                product_id = product_rating_list[index][0]
                rating = product_rating_list[index][1]
                review_txt += line.replace('\n', '') + '\t' + str(float(rating) - product_avg_rating_dict[product_id]) +'\n'
        with open(self.new_file + '18', 'w') as fp:
            fp.write(review_txt)

    def save_f19(self):
        """F19: discretized rating — 1 if >= 4, -1 if <= 2.5, else 0.

        NOTE(review): a parse failure skips the row entirely (only prints the
        index), which would desynchronize subsequent columns.
        """
        review_txt = ""
        rating_list = self.fu.get_rating_list()
        with open(self.old_file + '18') as fp:
            for index, line in enumerate(fp.readlines()):
                try:
                    rating = float(rating_list[index])
                    if rating >= 4:
                        review_txt += line.replace('\n', '') + '\t' + '1\n'
                    elif rating <= 2.5:
                        review_txt += line.replace('\n', '') + '\t' + '-1\n'
                    else:
                        review_txt += line.replace('\n', '') + '\t' + '0\n'
                except:
                    print index
        with open(self.new_file + '19', 'w') as fp:
            fp.write(review_txt)

    def save_f20(self):
        """F20: 1 when this is the product's first NEGATIVE review (col 19 == -1,
        earliest-date rank) and the chronologically first review was positive;
        0 otherwise.  NOTE(review): ``dict`` shadows the builtin.
        """
        review_txt = ""
        reviewer_product_date_list = self.fu.get_column_list([1,2])
        review_txt = ""
        dict = {}
        for idx, reviewer_product_date in enumerate(reviewer_product_date_list):
            product = reviewer_product_date[0]
            date = reviewer_product_date[1]
            if product not in dict:
                dict[product] = {}
            try:
                dict[product][idx] = parse(date)
            except:
                print date
        rank_list = rank_dict(dict, False)
        with open(self.old_file + '19') as fp:
            features = fp.readlines()
            for index, line in enumerate(features):
                product_id = reviewer_product_date_list[index][0]
                rank = rank_list[product_id][index]
                rating_type = int(line.split('\t')[19])
                if rank == 1 and rating_type == -1:
                    # find the product's rank-0 (chronologically first) review
                    first_review_index = 0
                    for review in rank_list[product_id].keys():
                        if rank_list[product_id][review] == 0:
                            first_review_index = review
                    if int(features[first_review_index].split('\t')[19]) == 1:
                        print index, first_review_index
                        review_txt += line.replace('\n', '') + '\t' + '1\n'
                    else:
                        review_txt += line.replace('\n', '') + '\t' + '0\n'
                else:
                    review_txt += line.replace('\n', '') + '\t' + '0\n'
        with open(self.new_file + '20', 'w') as fp:
            fp.write(review_txt)

    def save_f21(self):
        """F21: mirror of F20 with reversed rank order (rank_dict(..., True)):
        1 when this is the product's first POSITIVE review and the first
        review overall was negative; 0 otherwise."""
        review_txt = ""
        reviewer_product_date_list = self.fu.get_column_list([1,2])
        review_txt = ""
        dict = {}
        for idx, reviewer_product_date in enumerate(reviewer_product_date_list):
            product = reviewer_product_date[0]
            date = reviewer_product_date[1]
            if product not in dict:
                dict[product] = {}
            try:
                dict[product][idx] = parse(date)
            except:
                print date
        rank_list = rank_dict(dict, True)
        with open(self.old_file + '20') as fp:
            features = fp.readlines()
            for index, line in enumerate(features):
                product_id = reviewer_product_date_list[index][0]
                rank = rank_list[product_id][index]
                rating_type = int(line.split('\t')[19])
                if rank == 1 and rating_type == 1:
                    first_review_index = 0
                    for review in rank_list[product_id].keys():
                        if rank_list[product_id][review] == 0:
                            first_review_index = review
                    if int(features[first_review_index].split('\t')[19]) == -1:
                        print index, first_review_index
                        review_txt += line.replace('\n', '') + '\t' + '1\n'
                    else:
                        review_txt += line.replace('\n', '') + '\t' + '0\n'
                else:
                    review_txt += line.replace('\n', '') + '\t' + '0\n'
        with open(self.new_file + '21', 'w') as fp:
            fp.write(review_txt)

    def save_f22(self):
        """F22: per reviewer, fraction of their reviews flagged in column 8."""
        review_txt = ""
        with open(self.old_file + '21') as fp:
            reviewers = {}
            lines = fp.readlines()
            # pass 1: tally per-reviewer totals and column-8 hits
            for line in lines:
                features = line.split('\t')
                if not reviewers.has_key(features[0]):
                    reviewers[features[0]] = {'review_num': 0, 'first_review_num': 0.0}
                if int(features[8]) == 1:
                    reviewers[features[0]]['first_review_num'] += 1
                reviewers[features[0]]['review_num'] += 1
            # features = [line.split('\t') for line in lines]
            print 'finish features'
            # pass 2: emit the ratio for every row
            for index, line in enumerate(lines):
                features = line.split('\t')
                review_id = features[0]
                review_num = reviewers[review_id]['review_num']
                first_review_num = reviewers[review_id]['first_review_num']
                if first_review_num > 0 and first_review_num != review_num:
                    print index, first_review_num
                review_txt += lines[index].replace('\n', '') + '\t' + str(first_review_num / review_num) +'\n'
        with open(self.new_file + '22', 'w') as fp:
            fp.write(review_txt)

    def save_f23(self):
        """F23: per reviewer, fraction of their reviews flagged in column 9."""
        review_txt = ""
        with open(self.old_file + '22') as fp:
            reviewers = {}
            lines = fp.readlines()
            for line in lines:
                features = line.split('\t')
                if not reviewers.has_key(features[0]):
                    reviewers[features[0]] = {'review_num': 0, 'only_review_num': 0.0}
                if int(features[9]) == 1:
                    reviewers[features[0]]['only_review_num'] += 1
                reviewers[features[0]]['review_num'] += 1
            # features = [line.split('\t') for line in lines]
            print 'finish features'
            for index, line in enumerate(lines):
                features = line.split('\t')
                review_id = features[0]
                review_num = reviewers[review_id]['review_num']
                only_review_num = reviewers[review_id]['only_review_num']
                if only_review_num > 0 and only_review_num != review_num:
                    print index, only_review_num
                review_txt += lines[index].replace('\n', '') + '\t' + str(only_review_num / review_num) +'\n'
        with open(self.new_file + '23', 'w') as fp:
            fp.write(review_txt)

    def save_f24(self):
        """F24: the reviewer's average rating across all their reviews."""
        review_txt = ""
        reviewer_rating_list = self.fu.get_column_list([0,5])
        reviewers = {}
        for reviewer_rating in reviewer_rating_list:
            reviewer_id = reviewer_rating[0]
            rating = reviewer_rating[1]
            if not reviewers.has_key(reviewer_id):
                reviewers[reviewer_id] = {'ratings': [], 'avg_rating': 0.0}
            reviewers[reviewer_id]['ratings'].append(float(rating))
        for reviewer_id in reviewers.keys():
            ratings = reviewers[reviewer_id]['ratings']
            reviewers[reviewer_id]['avg_rating'] = sum(ratings) / len(ratings)
        with open(self.old_file + '23') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                features = line.split('\t')
                reviewer_id = features[0]
                review_txt += lines[index].replace('\n', '') + '\t' + str(reviewers[reviewer_id]['avg_rating']) +'\n'
        with open(self.new_file + '24', 'w') as fp:
            fp.write(review_txt)

    def save_f25(self):
        """F25: per-reviewer rating spread, sqrt of the sum of squared
        deviations (not divided by n, so not a true std deviation)."""
        review_txt = ""
        reviewer_rating_list = self.fu.get_column_list([0,5])
        reviewers = {}
        for reviewer_rating in reviewer_rating_list:
            reviewer_id = reviewer_rating[0]
            rating = reviewer_rating[1]
            if not reviewers.has_key(reviewer_id):
                reviewers[reviewer_id] = {'ratings': [], 'avg_rating': 0.0, 'std_rating': 0.0}
            reviewers[reviewer_id]['ratings'].append(float(rating))
        for reviewer_id in reviewers.keys():
            ratings = reviewers[reviewer_id]['ratings']
            reviewers[reviewer_id]['avg_rating'] = sum(ratings) / len(ratings)
        for reviewer_id in reviewers.keys():
            ratings = reviewers[reviewer_id]['ratings']
            avg_rating = reviewers[reviewer_id]['avg_rating']
            std_rating = math.sqrt(sum([ (rating - avg_rating)**2 for rating in ratings ]))
            reviewers[reviewer_id]['std_rating'] = std_rating
        with open(self.old_file + '24') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                features = line.split('\t')
                reviewer_id = features[0]
                review_txt += lines[index].replace('\n', '') + '\t' + str(reviewers[reviewer_id]['std_rating']) +'\n'
        with open(self.new_file + '25', 'w') as fp:
            fp.write(review_txt)

    def save_f26(self):
        """F26: 1 when ALL of a reviewer's discretized ratings (1/0/-1) are
        identical, else 0.  Note the >= 4 / < 2.5 cut-offs here leave ratings
        in [2.5, 4) as 0 — slightly different from save_f19's <= 2.5."""
        review_txt = ""
        reviewer_rating_list = self.fu.get_column_list([0,5])
        reviewers = {}
        for reviewer_rating in reviewer_rating_list:
            reviewer_id = reviewer_rating[0]
            rating = float(reviewer_rating[1])
            if not reviewers.has_key(reviewer_id):
                reviewers[reviewer_id] = []
            rating_flag = 1
            if rating >= 4:
                rating_flag = 1
            elif rating < 2.5:
                rating_flag = -1
            else:
                rating_flag = 0
            reviewers[reviewer_id].append(rating_flag)
        with open(self.old_file + '25') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                features = line.split('\t')
                reviewer_id = features[0]
                if all_same(reviewers[reviewer_id]):
                    review_txt += lines[index].replace('\n', '') + '\t' +'1\n'
                else:
                    review_txt += lines[index].replace('\n', '') + '\t' +'0\n'
        with open(self.new_file + '26', 'w') as fp:
            fp.write(review_txt)

    def save_f27(self):
        """F27: 1 when the reviewer gave both good AND bad ratings but never
        an average one, else 0."""
        review_txt = ""
        reviewer_rating_list = self.fu.get_column_list([0,5])
        reviewers = {}
        for reviewer_rating in reviewer_rating_list:
            reviewer_id = reviewer_rating[0]
            rating = float(reviewer_rating[1])
            if not reviewers.has_key(reviewer_id):
                reviewers[reviewer_id] = {'good': False, 'avg': False, 'bad': False}
            if rating >= 4:
                reviewers[reviewer_id]['good'] = True
            elif rating < 2.5:
                reviewers[reviewer_id]['bad'] = True
            else:
                reviewers[reviewer_id]['avg'] = True
        with open(self.old_file + '26') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                features = line.split('\t')
                reviewer_id = features[0]
                if reviewers[reviewer_id]['good'] and reviewers[reviewer_id]['bad'] and not reviewers[reviewer_id]['avg']:
                    print index
                    review_txt += lines[index].replace('\n', '') + '\t' +'1\n'
                else:
                    review_txt += lines[index].replace('\n', '') + '\t' +'0\n'
        with open(self.new_file + '27', 'w') as fp:
            fp.write(review_txt)

    def save_f28(self):
        """F28: 1 when the reviewer gave good AND average ratings but never a
        bad one, else 0."""
        review_txt = ""
        reviewer_rating_list = self.fu.get_column_list([0,5])
        reviewers = {}
        for reviewer_rating in reviewer_rating_list:
            reviewer_id = reviewer_rating[0]
            rating = float(reviewer_rating[1])
            if not reviewers.has_key(reviewer_id):
                reviewers[reviewer_id] = {'good': False, 'avg': False, 'bad': False}
            if rating >= 4:
                reviewers[reviewer_id]['good'] = True
            elif rating < 2.5:
                reviewers[reviewer_id]['bad'] = True
            else:
                reviewers[reviewer_id]['avg'] = True
        with open(self.old_file + '27') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                features = line.split('\t')
                reviewer_id = features[0]
                if reviewers[reviewer_id]['good'] and reviewers[reviewer_id]['avg'] and not reviewers[reviewer_id]['bad']:
                    print index
                    review_txt += lines[index].replace('\n', '') + '\t' +'1\n'
                else:
                    review_txt += lines[index].replace('\n', '') + '\t' +'0\n'
        with open(self.new_file + '28', 'w') as fp:
            fp.write(review_txt)

    def save_f29(self):
        """F29: 1 when the reviewer gave bad AND average ratings but never a
        good one, else 0."""
        review_txt = ""
        reviewer_rating_list = self.fu.get_column_list([0,5])
        reviewers = {}
        for reviewer_rating in reviewer_rating_list:
            reviewer_id = reviewer_rating[0]
            rating = float(reviewer_rating[1])
            if not reviewers.has_key(reviewer_id):
                reviewers[reviewer_id] = {'good': False, 'avg': False, 'bad': False}
            if rating >= 4:
                reviewers[reviewer_id]['good'] = True
            elif rating < 2.5:
                reviewers[reviewer_id]['bad'] = True
            else:
                reviewers[reviewer_id]['avg'] = True
        with open(self.old_file + '28') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                features = line.split('\t')
                reviewer_id = features[0]
                if reviewers[reviewer_id]['bad'] and reviewers[reviewer_id]['avg'] and not reviewers[reviewer_id]['good']:
                    print index
                    review_txt += lines[index].replace('\n', '') + '\t' +'1\n'
                else:
                    review_txt += lines[index].replace('\n', '') + '\t' +'0\n'
        with open(self.new_file + '29', 'w') as fp:
            fp.write(review_txt)

    def save_f30(self):
        """F30: 1 when the reviewer has given good, average AND bad ratings, else 0."""
        review_txt = ""
        reviewer_rating_list = self.fu.get_column_list([0,5])
        reviewers = {}
        for reviewer_rating in reviewer_rating_list:
            reviewer_id = reviewer_rating[0]
            rating = float(reviewer_rating[1])
            if not reviewers.has_key(reviewer_id):
                reviewers[reviewer_id] = {'good': False, 'avg': False, 'bad': False}
            if rating >= 4:
                reviewers[reviewer_id]['good'] = True
            elif rating < 2.5:
                reviewers[reviewer_id]['bad'] = True
            else:
                reviewers[reviewer_id]['avg'] = True
        with open(self.old_file + '29') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                features = line.split('\t')
                reviewer_id = features[0]
                if reviewers[reviewer_id]['bad'] and reviewers[reviewer_id]['avg'] and reviewers[reviewer_id]['good']:
                    print index
                    review_txt += lines[index].replace('\n', '') + '\t' +'1\n'
                else:
                    review_txt += lines[index].replace('\n', '') + '\t' +'0\n'
        with open(self.new_file + '30', 'w') as fp:
            fp.write(review_txt)

    def save_f31(self):
        """F31: per reviewer, fraction of their reviews flagged in column 20
        (the F20 flag).  The input file is read twice: once to tally, once to emit."""
        review_txt = ""
        reviewers = {}
        with open(self.old_file + '30') as fp:
            for features in fp.readlines():
                features = features.split('\t')
                reviewer_id = features[0]
                if not reviewers.has_key(reviewer_id):
                    reviewers[reviewer_id] = {'total': 0.0, 'first': 0.0}
                reviewers[reviewer_id]['total'] += 1
                if int(features[20]):
                    reviewers[reviewer_id]['first'] += 1
        with open(self.old_file + '30') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                features = line.split('\t')
                reviewer_id = features[0]
                review_txt += lines[index].replace('\n', '') + '\t' + str(reviewers[reviewer_id]['first'] / reviewers[reviewer_id]['total']) +'\n'
                if reviewers[reviewer_id]['first'] and reviewers[reviewer_id]['first'] != 1:
                    print index
        with open(self.new_file + '31', 'w') as fp:
            fp.write(review_txt)

    def save_f32(self):
        """F32: per reviewer, fraction of their reviews flagged in column 21
        (the F21 flag)."""
        review_txt = ""
        reviewers = {}
        with open(self.old_file + '31') as fp:
            for features in fp.readlines():
                features = features.split('\t')
                reviewer_id = features[0]
                if not reviewers.has_key(reviewer_id):
                    reviewers[reviewer_id] = {'total': 0.0, 'first': 0.0}
                reviewers[reviewer_id]['total'] += 1
                if int(features[21]):
                    reviewers[reviewer_id]['first'] += 1
        with open(self.old_file + '31') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                features = line.split('\t')
                reviewer_id = features[0]
                review_txt += lines[index].replace('\n', '') + '\t' + str(reviewers[reviewer_id]['first'] / reviewers[reviewer_id]['total']) +'\n'
                if reviewers[reviewer_id]['first'] and reviewers[reviewer_id]['first'] != 1:
                    print index
        with open(self.new_file + '32', 'w') as fp:
            fp.write(review_txt)

    def save_f33(self):
        """F33: the product's price, looked up from the product-info dump."""
        review_txt = ""
        product_price = product_price_dict('../AmazonDataBackup/productInfoXML-reviewed-mProducts.copy')
        product_list = self.fu.get_productId_list()
        with open(self.old_file + '32') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                product_id = product_list[index]
                review_txt += lines[index].replace('\n', '') + '\t' + str(product_price[product_id]) +'\n'
        with open(self.new_file + '33', 'w') as fp:
            fp.write(review_txt)

    def save_f34(self):
        """F34: the product's sales rank, looked up from the product-info dump."""
        review_txt = ""
        product_rank = product_rank_dict('../AmazonDataBackup/productInfoXML-reviewed-mProducts.copy')
        product_list = self.fu.get_productId_list()
        with open(self.old_file + '33') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                product_id = product_list[index]
                review_txt += lines[index].replace('\n', '') + '\t' + str(product_rank[product_id]) +'\n'
        with open(self.new_file + '34', 'w') as fp:
            fp.write(review_txt)

    def save_f35(self):
        """F35: the product's average rating across all its reviews.

        NOTE(review): ``product_rank`` is computed but never used here.
        """
        review_txt = ""
        product_list = self.fu.get_productId_list()
        product_rank = product_rank_dict('../AmazonDataBackup/productInfoXML-reviewed-mProducts.copy')
        product_rating_list = self.fu.get_column_list([1, 5])
        products = {}
        for product_rating in product_rating_list:
            product_id = product_rating[0]
            rating = float(product_rating[1])
            if not products.has_key(product_id):
                products[product_id] = {'ratings': [], 'avg': 0.0}
            products[product_id]['ratings'].append(rating)
        for product_id in products.keys():
            products[product_id]['avg'] = sum(products[product_id]['ratings']) / len(products[product_id]['ratings'])
        with open(self.old_file + '34') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                product_id = product_list[index]
                review_txt += lines[index].replace('\n', '') + '\t' + str(products[product_id]['avg']) +'\n'
                if index == 0:
                    print products[product_id]
        with open(self.new_file + '35', 'w') as fp:
            fp.write(review_txt)

    def save_f36(self):
        """F36: the product's rating spread (sqrt of summed squared deviations,
        as in save_f25).  NOTE(review): ``product_rank`` is again unused."""
        review_txt = ""
        product_list = self.fu.get_productId_list()
        product_rank = product_rank_dict('../AmazonDataBackup/productInfoXML-reviewed-mProducts.copy')
        product_rating_list = self.fu.get_column_list([1, 5])
        products = {}
        for product_rating in product_rating_list:
            product_id = product_rating[0]
            rating = float(product_rating[1])
            if not products.has_key(product_id):
                products[product_id] = {'ratings': [], 'avg': 0.0, 'std': 0.0}
            products[product_id]['ratings'].append(rating)
        for product_id in products.keys():
            products[product_id]['avg'] = sum(products[product_id]['ratings']) / len(products[product_id]['ratings'])
        for product_id in products.keys():
            avg = products[product_id]['avg']
            std = math.sqrt(sum([(rating - avg)**2 for rating in products[product_id]['ratings']]))
            products[product_id]['std'] = std
        with open(self.old_file + '35') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                product_id = product_list[index]
                review_txt += lines[index].replace('\n', '') + '\t' + str(products[product_id]['std']) +'\n'
                if index == 0:
                    print products[product_id]
        with open(self.new_file + '36', 'w') as fp:
            fp.write(review_txt)

    def save_labels(self):
        """Label near-duplicate reviews: any pair whose 2-gram Jaccard score
        reaches 0.9 marks BOTH reviews with label 1; O(n^2) over all pairs.

        NOTE(review): ``product_id = product_list[index]`` references an
        undefined (and unused) name — it will raise NameError at runtime and
        looks copy-pasted from save_f35/36; it should likely be deleted.
        """
        review_txt = ""
        content_list = self.fu.get_content_list()
        print 'get content list'
        grams_list = []
        for content in content_list:
            grams_list.append(get_2_grams(content))
        print 'get grams list'
        label_list = []
        content_len = len(content_list)
        for x in xrange(0,content_len):
            label_list.append(0)
        print 'start labeling'
        for i in xrange(0,content_len):
            grams_a = grams_list[i]
            for j in xrange(i+1,content_len):
                grams_b = grams_list[j]
                sim = jaccard_distance(grams_a, grams_b)
                if sim >= 0.9:
                    print "sim is : " , sim
                    label_list[i] = 1
                    label_list[j] = 1
        with open(self.old_file + '36') as fp:
            lines = fp.readlines()
            for index, line in enumerate(lines):
                product_id = product_list[index]
                review_txt += lines[index].replace('\n', '') + '\t' + str(label_list[index]) +'\n'
        with open(self.new_file + '37', 'w') as fp:
            fp.write(review_txt)