def preprocessTweetsAndSave(args, prep):
    with open(args.tempFile, "w") as f:
        for tweet in readFile(args.filename, columns=args.filenameColumns,
                              sep=args.filenameDelimiter)[args.dataColumnName].values:
            # some tweets in the file reduced-tweets.parquet were None
            if tweet is not None:
                tweet = prep.replace_hashtags_URL_USER(tweet, mode_URL="replace",
                                                       mode_Mentions="replace")
                f.write(" ".join(prep.tokenize(tweet)) + "\n")
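# Usage sketch (assumption, not part of the original script): `prep` is any
# preprocessor object exposing tokenize() and replace_hashtags_URL_USER();
# the argparse namespace is stubbed with SimpleNamespace purely for illustration.
from types import SimpleNamespace

demo_args = SimpleNamespace(
    tempFile="tokenized-tweets.txt",    # output: one whitespace-joined tweet per line
    filename="reduced-tweets.parquet",  # the input file mentioned in the comment above
    filenameColumns=None,
    filenameDelimiter=None,
    dataColumnName="tweet",
)
# preprocessTweetsAndSave(demo_args, prep)  # requires a constructed preprocessor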
                    retweeted_place_place_type, retweeted_created_at, retweeted_tweet_longitude, retweeted_tweet_latitude, \
                    retweeted_text)",
                    type=json.loads, default={})
args = parser.parse_args()

if args.mode == "local":
    # check from which source to read the data
    if args.filename is not None:
        print("Local mode: Read file..")
        filtered_df = filter_dataframe(
            readFile(args.filename, columns=args.filenameColumns,
                     sep=args.filenameDelimiter),
            args.configDict, language=args.lang,
            withRetweets=args.withRetweets,
            withOriginalTweetOfRetweet=args.withOriginalTweetOfRetweet,
            deleteDuplicates=True)
        print("Save result to {} ..".format(args.saveResultPath))
        savePandasDFtoFile(filtered_df, args.saveResultPath)
    # Check if necessary arguments are given
    elif args.localMongoDatabase is None and args.localMongoCollection is None:
        sys.stderr.write(
            "ERROR: A MongoDB database and collection need to be provided to extract the data"
        )
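    # Hedged sketch (assumption, not the original code): once a database and
    # collection are provided, the local-MongoDB branch presumably reads the
    # tweets via pymongo and reuses the same filter. Host, port, and the
    # DataFrame conversion below are illustrative defaults.
    else:
        from pymongo import MongoClient
        import pandas as pd
        client = MongoClient("localhost", 27017)
        collection = client[args.localMongoDatabase][args.localMongoCollection]
        # materialize the cursor into a DataFrame so filter_dataframe can be reused
        raw_df = pd.DataFrame(list(collection.find()))
        filtered_df = filter_dataframe(
            raw_df, args.configDict, language=args.lang,
            withRetweets=args.withRetweets,
            withOriginalTweetOfRetweet=args.withOriginalTweetOfRetweet,
            deleteDuplicates=True)
        print("Save result to {} ..".format(args.saveResultPath))
        savePandasDFtoFile(filtered_df, args.saveResultPath)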
editDistPos = 9
editSenDistPos = 10
bleuPos = 11

metScores = []
terScores = []
mtBleuScores = []
mtNistScores = []
mtNormedNist = []
beerScores = []
corpusEvalBleu = None
corpusEvalNist = None

try:
    refList = rw.readFile(args.ref)
    srcList = rw.readFile(args.src)
    hypList = rw.readFile(args.hyp)
    # the three files must be parallel: one segment per line in each
    if not (len(refList) == len(hypList) == len(srcList)):
        raise UnboundLocalError
except IOError as io:
    print('There was a problem loading your named files...')
    print(io)
    print('Please check and try again...')
    print('Exiting...')
    sys.exit()
except UnboundLocalError as ule:
    print('Your given files have different lengths, please check and try again')
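# Aside (illustrative helper, not part of the original): the parallel-file check
# above generalizes to any number of files; a self-contained variant that raises
# a more descriptive exception could look like this.
def _assert_parallel(*line_lists):
    """Raise ValueError unless all given line lists have the same length."""
    lengths = {len(lines) for lines in line_lists}
    if len(lengths) > 1:
        raise ValueError("parallel files differ in length: {}".format(sorted(lengths)))

# _assert_parallel(refList, srcList, hypList)  # would replace the manual check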
parser.add_argument("-s", "--pathSave", help="Path to save personal database to (.parquet or .csv)") parser.add_argument("-tn", "--columnNameTextData", help="Column name of the text data", default="tweet") parser.add_argument("-unc", "--userNameColumn", help="Give the user_name column (default: user_screen_name)", default="user_screen_name") args = parser.parse_args() print("Load user classifier..") #loaded_model = joblib.load(filename) model_user_classif = joblib.load(args.pathUserClassifier) print("Load tweet classifier..") model_tweet_classif = joblib.load(args.pathTweetClassifier) print("Load word embedding..") wordEmbedding = FastText.load(args.pathWordEmbedding) print("Load data..") tweets = readFile(args.pathData) print("Classify all tweets of an user and exclude all users with a mean score < {} ...".format(args.scorePersonalMinimum)) tweets_user_pers = score_users(tweets, model_user_classif, wordEmbedding, args.userNameColumn, score_personal_minimum=args.scorePersonalMinimum, textColumn=args.columnNameTextData) print("Number tweets personal users:", len(tweets_user_pers)) print("Classify only personal tweets of personal users..") tweets_personal = get_personal_tweets(tweets_user_pers, model_tweet_classif, wordEmbedding, textColumn=args.columnNameTextData) print("Number personal tweets:", len(tweets_personal)) print(tweets_personal.head()) print("Save personal tweets to file {} ...".format(args.pathSave)) savePandasDFtoFile(tweets_personal, args.pathSave)
"--hyp", help="the hypothesis file to be used (required)") parser.add_argument("-s", "--src", help="the src file to be used (required)") args = parser.parse_args() outList = [] head = '<srcset setid="testName" srclang="any">\n' docHead = '<doc sysid="src" docid="evaluation" genre="any" origlang="zh">\n' pIn = '<p>\n' pOut = '</p>\n' docOut = '</doc>\n' endFile = '</srcset>' for fin in [args.ref, args.hyp, args.src]: inList = rw.readFile(fin) outList = [] outList.append(head) outList.append(docHead) outList.append(pIn) for i, line in enumerate(inList): outList.append('<seg id="{0}">{1}</seg>\n'.format(i + 1, line.strip())) outList.append(pOut) outList.append(docOut) outList.append(endFile) rw.writeFile(outList, fin.replace('.txt', '_mt_seg.txt')) print('DONE')