def preprocessTweetsAndSave(args, prep):
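    """Read tweets, replace URLs/mentions, tokenize, and write one tweet per line to args.tempFile."""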

    with open(args.tempFile, "w") as f:
        for tweet in readFile(
                args.filename,
                columns=args.filenameColumns,
                sep=args.filenameDelimiter)[args.dataColumnName].values:
            # some tweets in the file reduced-tweets.parquet were None
            if tweet is not None:
                tweet = prep.replace_hashtags_URL_USER(tweet,
                                                       mode_URL="replace",
                                                       mode_Mentions="replace")
                f.write((" ".join(prep.tokenize(tweet))) + "\n")

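A minimal usage sketch for the function above, assuming readFile is importable from the surrounding project; the Namespace fields and the stub preprocessor are hypothetical and mirror only what the function accesses.

from argparse import Namespace

class StubPrep:
    # Hypothetical stand-in; the real project supplies its own preprocessor
    # offering the two methods used above.
    def replace_hashtags_URL_USER(self, tweet, mode_URL="replace", mode_Mentions="replace"):
        return tweet  # a real implementation rewrites URLs and @mentions

    def tokenize(self, tweet):
        return tweet.split()

args = Namespace(tempFile="tweets_pre.txt",
                 filename="reduced-tweets.parquet",
                 filenameColumns=None,
                 filenameDelimiter=",",
                 dataColumnName="tweet")
preprocessTweetsAndSave(args, StubPrep())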
Example #2
                                                     retweeted_place_place_type, retweeted_created_at, retweeted_tweet_longitude, retweeted_tweet_latitude, \
                                                     retweeted_text)",
        type=json.loads,
        default={})
    args = parser.parse_args()

    if args.mode == "local":

        # check from which source to read the data

        if args.filename is not None:
            print("Local mode: Read file..")

            filtered_df = filter_dataframe(
                readFile(args.filename,
                         columns=args.filenameColumns,
                         sep=args.filenameDelimiter),
                args.configDict,
                language=args.lang,
                withRetweets=args.withRetweets,
                withOriginalTweetOfRetweet=args.withOriginalTweetOfRetweet,
                deleteDuplicates=True)

            print("Save result to {} ..".format(args.saveResultPath))
            savePandasDFtoFile(filtered_df, args.saveResultPath)

        # Check that the necessary arguments are given
        elif args.localMongoDatabase is None or args.localMongoCollection is None:
            sys.stderr.write(
                "ERROR: A MongoDB database and collection need to be provided to extract the data\n"
            )
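The branch structure above enforces an either/or data source: a local file, otherwise a MongoDB database plus collection. A minimal self-contained sketch of that validation pattern (argument names mirror the excerpt; everything else is hypothetical):

import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument("--filename")
parser.add_argument("--localMongoDatabase")
parser.add_argument("--localMongoCollection")
args = parser.parse_args()

if args.filename is None and \
        (args.localMongoDatabase is None or args.localMongoCollection is None):
    # Without a file, both MongoDB parameters are required.
    sys.stderr.write("ERROR: A MongoDB database and collection need to be "
                     "provided to extract the data\n")
    sys.exit(1)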
Example #3
editDistPos = 9
editSenDistPos = 10
bleuPos = 11

metScores = []
terScores = []
mtBleuScores = []
mtNistScores = []
mtNormedNist = []
beerScores = []

corpusEvalBleu = None
corpusEvalNist = None
 
try:
    refList = rw.readFile(args.ref)
    srcList = rw.readFile(args.src)
    hypList = rw.readFile(args.hyp)

    # All three files must have the same number of segments.
    if not (len(refList) == len(hypList) == len(srcList)):
        raise ValueError
except IOError as io:
    print('There was a problem loading your named files...')
    print(io)
    print('Please check and try again...')
    print('Exiting...')
    sys.exit()
except ValueError:
    print('Your given files have different lengths, please check and try again')
    sys.exit()
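corpusEvalBleu and corpusEvalNist are initialised above but never filled in this excerpt. A minimal sketch of how the corpus-level scores could be computed with NLTK (an assumption; the original project may ship its own scorers), continuing with refList and hypList from the try block:

from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.nist_score import corpus_nist

# Each hypothesis is paired with its (single) reference; both sides tokenized.
references = [[ref.split()] for ref in refList]
hypotheses = [hyp.split() for hyp in hypList]

corpusEvalBleu = corpus_bleu(references, hypotheses)
corpusEvalNist = corpus_nist(references, hypotheses, n=5)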
Example #4
    parser.add_argument("-s", "--pathSave", help="Path to save personal database to (.parquet or .csv)")
    parser.add_argument("-tn", "--columnNameTextData", help="Column name of the text data", default="tweet")
    parser.add_argument("-unc", "--userNameColumn", help="Give the user_name column (default: user_screen_name)", default="user_screen_name")

    args = parser.parse_args()

    print("Load user classifier..")
    model_user_classif = joblib.load(args.pathUserClassifier)

    print("Load tweet classifier..")
    model_tweet_classif = joblib.load(args.pathTweetClassifier)

    print("Load word embedding..")
    wordEmbedding = FastText.load(args.pathWordEmbedding)

    print("Load data..")
    tweets = readFile(args.pathData)

    print("Classify all tweets of an user and exclude all users with a mean score < {} ...".format(args.scorePersonalMinimum))
    tweets_user_pers = score_users(tweets, model_user_classif, wordEmbedding, args.userNameColumn, score_personal_minimum=args.scorePersonalMinimum, textColumn=args.columnNameTextData)
    print("Number tweets personal users:", len(tweets_user_pers))

    print("Classify only personal tweets of personal users..")
    tweets_personal = get_personal_tweets(tweets_user_pers, model_tweet_classif, wordEmbedding, textColumn=args.columnNameTextData)
    print("Number personal tweets:", len(tweets_personal))
    print(tweets_personal.head())

    print("Save personal tweets to file {}  ...".format(args.pathSave))
    savePandasDFtoFile(tweets_personal, args.pathSave)
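score_users and get_personal_tweets come from the surrounding project and are not shown here. As an illustration only, a hypothetical pandas sketch of the mean-score-per-user filtering that score_users appears to perform; names and threshold handling are assumptions, not the project's implementation:

import pandas as pd

def filter_personal_users(df, scores, user_col="user_screen_name", minimum=0.5):
    # Keep only tweets from users whose mean personal score reaches the minimum.
    df = df.assign(score=scores)  # one classifier score per tweet
    mean_per_user = df.groupby(user_col)["score"].transform("mean")
    return df[mean_per_user >= minimum].drop(columns="score")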
Example #5
                    "--hyp",
                    help="the hypothesis file to be used (required)")
parser.add_argument("-s", "--src", help="the src file to be used (required)")

args = parser.parse_args()

head = '<srcset setid="testName" srclang="any">\n'
docHead = '<doc sysid="src" docid="evaluation" genre="any" origlang="zh">\n'
pIn = '<p>\n'
pOut = '</p>\n'
docOut = '</doc>\n'
endFile = '</srcset>'

for fin in [args.ref, args.hyp, args.src]:
    inList = rw.readFile(fin)
    outList = []
    outList.append(head)
    outList.append(docHead)
    outList.append(pIn)

    for i, line in enumerate(inList):
        outList.append('<seg id="{0}">{1}</seg>\n'.format(i + 1, line.strip()))

    outList.append(pOut)
    outList.append(docOut)
    outList.append(endFile)

    rw.writeFile(outList, fin.replace('.txt', '_mt_seg.txt'))
    print('DONE')
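Run on a two-line input file, each rewritten *_mt_seg.txt file contains exactly this NIST-style SGML wrapper (reconstructed from the string templates above):

<srcset setid="testName" srclang="any">
<doc sysid="src" docid="evaluation" genre="any" origlang="zh">
<p>
<seg id="1">first segment</seg>
<seg id="2">second segment</seg>
</p>
</doc>
</srcset>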